# # Derived from source code of TrueCrypt 7.1a, which is # Copyright (c) 2008-2012 TrueCrypt Developers Association and which is governed # by the TrueCrypt License 3.0. # # Modifications and additions to the original source code (contained in this file) # and all other portions of this file are Copyright (c) 2013-2017 IDRIX # and are governed by the Apache License 2.0 the full text of which is # contained in the file License.txt included in VeraCrypt binary and source # code distribution packages. # #------ Command line arguments ------ # DEBUG: Disable optimizations and enable debugging checks # DEBUGGER: Enable debugging information for use by debuggers # NOASM: Exclude modules requiring assembler # NOGUI: Disable graphical user interface (build console-only application) # NOSTRIP: Do not strip release binary # NOTEST: Do not test release binary # RESOURCEDIR: Run-time resource directory # VERBOSE: Enable verbose messages # WXSTATIC: Use static wxWidgets library # SSSE3: Enable SSSE3 support in compiler # SSE41: Enable SSE4.1 support in compiler # NOSSE2: Disable SEE2 support in compiler # WITHGTK3: Build wxWidgets against GTK3 #------ Targets ------ # all # clean # wxbuild: Configure and build wxWidgets - source code must be located at $(WX_ROOT) #------ Build configuration ------ export APPNAME := veracrypt export BASE_DIR := $(CURDIR) export BUILD_INC := $(BASE_DIR)/Build/Include export AR ?= ar export CC ?= gcc export CXX ?= g++ export AS := yasm export RANLIB ?= ranlib export CFLAGS := -Wall export CXXFLAGS := -Wall -Wno-unused-parameter C_CXX_FLAGS := -MMD -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGE_FILES -I$(BASE_DIR) -I$(BASE_DIR)/Crypto export ASFLAGS := -D __GNUC__ -D __YASM__ export LFLAGS := export PKG_CONFIG_PATH ?= /usr/local/lib/pkgconfig export WX_CONFIG ?= wx-config export WX_CONFIG_ARGS := --unicode WX_CONFIGURE_FLAGS := export WXCONFIG_CFLAGS := export WXCONFIG_CXXFLAGS := WX_ROOT ?= $(BASE_DIR)/wxWidgets export TC_BUILD_CONFIG := 
Release ifeq "$(origin DEBUG)" "command line" ifneq "$(DEBUG)" "0" TC_BUILD_CONFIG := Debug endif endif ifeq "$(origin NOGUI)" "command line" export TC_NO_GUI := 1 C_CXX_FLAGS += -DTC_NO_GUI -DwxUSE_GUI=0 WX_CONFIGURE_FLAGS += --disable-gui endif ifdef PKCS11_INC C_CXX_FLAGS += -I$(PKCS11_INC) else C_CXX_FLAGS += -I$(CURDIR)/PKCS11 endif ifeq "$(origin RESOURCEDIR)" "command line" C_CXX_FLAGS += -DTC_RESOURCE_DIR="$(RESOURCEDIR)" endif ifneq "$(origin VERBOSE)" "command line" MAKEFLAGS += -s endif ifeq "$(origin WXSTATIC)" "command line" export VC_WX_STATIC := 1 WX_CONFIG = $(WX_BUILD_DIR)/wx-config WX_CONFIG_ARGS += --static ifneq "$(WXSTATIC)" "FULL" export VC_WX_MINIMAL := 1 endif endif ifeq "$(origin INDICATOR)" "command line" ifneq (,$(findstring gtk3,$(shell $(WX_CONFIG) --selected-config))) INDICATOR_LIBRARY=ayatana-appindicator3-0.1 else INDICATOR_LIBRARY=ayatana-appindicator-0.1 endif export LIBS += $(shell pkg-config --libs $(INDICATOR_LIBRARY)) C_CXX_FLAGS += $(shell pkg-config --cflags $(INDICATOR_LIBRARY)) -DHAVE_INDICATORS endif #------ Release configuration ------ ifeq "$(TC_BUILD_CONFIG)" "Release" C_CXX_FLAGS += -O2 -fno-strict-aliasing # Do not enable strict aliasing export WX_BUILD_DIR ?= $(BASE_DIR)/wxrelease WX_CONFIGURE_FLAGS += --disable-debug_flag --disable-debug_gdb --disable-debug_info else #------ Debug configuration ------ C_CXX_FLAGS += -DDEBUG CXXFLAGS += -fno-default-inline -Wno-unused-function -Wno-unused-variable export WX_BUILD_DIR ?= $(BASE_DIR)/wxdebug WX_CONFIGURE_FLAGS += --enable-debug_flag --disable-debug_gdb --disable-debug_info endif #------ Debugger configuration ------ ifeq "$(origin DEBUGGER)" "command line" C_CXX_FLAGS += -ggdb WX_CONFIGURE_FLAGS += --enable-debug_gdb --enable-debug_info endif #------ Platform configuration ------ export PLATFORM := "Unknown" export PLATFORM_ARCH := "Unknown" export PLATFORM_UNSUPPORTED := 0 export CPU_ARCH ?= unknown export SIMD_SUPPORTED := 0 export DISABLE_AESNI ?= 0 export 
GCC_GTEQ_440 := 0 export GCC_GTEQ_430 := 0 ARCH ?= $(shell uname -m) ifneq (,$(filter i386 i486 i586 i686 x86,$(ARCH))) CPU_ARCH = x86 ASFLAGS += -f elf32 -D __BITS__=32 else ifneq (,$(filter x86_64 x86-64 amd64 x64,$(ARCH))) CPU_ARCH = x64 ASFLAGS += -f elf64 -D __BITS__=64 else ifneq (,$(filter armv7l,$(ARCH))) PLATFORM_ARCH := armv7 CPU_ARCH = armv7 endif ifeq "$(origin NOASM)" "command line" CPU_ARCH = unknown C_CXX_FLAGS += -DCRYPTOPP_DISABLE_X86ASM endif ifeq "$(CPU_ARCH)" "x86" PLATFORM_ARCH := i386 SIMD_SUPPORTED := 1 C_CXX_FLAGS += -D TC_ARCH_X86 else ifeq "$(CPU_ARCH)" "x64" PLATFORM_ARCH := amd64 SIMD_SUPPORTED := 1 C_CXX_FLAGS += -D TC_ARCH_X64 endif ifeq "$(origin NOSSE2)" "command line" SIMD_SUPPORTED := 0 endif ifeq "$(origin NOAESNI)" "command line" DISABLE_AESNI := 1 endif #------ Linux configuration ------ ifeq "$(shell uname -s)" "Linux" PLATFORM := Linux C_CXX_FLAGS += -DTC_UNIX -DTC_LINUX # GNU GCC version 11 and higher compile with -std=gnu++17 by default # which breaks "byte" definitions in Crypto++ library. So set # -std=gnu++14 instead. 
GCC11PLUS := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) ifeq "$(GCC11PLUS)" "1" CXXFLAGS += -std=gnu++14 endif ifeq "$(SIMD_SUPPORTED)" "1" CFLAGS += -msse2 CXXFLAGS += -msse2 GCC_GTEQ_440 := $(shell expr `$(CC) -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' -e 's/^[0-9]\{3,4\}$$/&00/' -e 's/^[0-9]\{1,2\}$$/&0000/'` \>= 40400) GCC_GTEQ_430 := $(shell expr `$(CC) -dumpversion | sed -e 's/\.\([0-9][0-9]\)/\1/g' -e 's/\.\([0-9]\)/0\1/g' -e 's/^[0-9]\{3,4\}$$/&00/' -e 's/^[0-9]\{1,2\}$$/&0000/'` \>= 40300) ifeq "$(DISABLE_AESNI)" "1" CFLAGS += -mno-aes -DCRYPTOPP_DISABLE_AESNI CXXFLAGS += -mno-aes -DCRYPTOPP_DISABLE_AESNI else ifeq "$(GCC_GTEQ_440)" "1" CFLAGS += -maes CXXFLAGS += -maes endif endif ifeq "$(GCC_GTEQ_430)" "1" ifeq "$(origin SSSE3)" "command line" CFLAGS += -mssse3 CXXFLAGS += -mssse3 endif ifeq "$(origin SSE41)" "command line" CFLAGS += -mssse3 -msse4.1 CXXFLAGS += -mssse3 -msse4.1 endif endif endif ifeq "$(TC_BUILD_CONFIG)" "Release" C_CXX_FLAGS += -fdata-sections -ffunction-sections -fpie LFLAGS += -Wl,--gc-sections -pie ifneq "$(shell ld --help 2>&1 | grep sysv | wc -l)" "0" LFLAGS += -Wl,--hash-style=sysv endif WXCONFIG_CFLAGS += -fdata-sections -ffunction-sections -fpie WXCONFIG_CXXFLAGS += -fdata-sections -ffunction-sections -fpie endif ifneq "$(origin WXSTATIC)" "command line" LFLAGS += -ldl else GCC5USED := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) ifeq "$(GCC5USED)" "1" CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 WXCONFIG_CXXFLAGS += -D_GLIBCXX_USE_CXX11_ABI=0 endif endif ifeq "$(origin NOSSE2)" "command line" CFLAGS += -mno-sse2 CXXFLAGS += -mno-sse2 WXCONFIG_CFLAGS += -mno-sse2 WXCONFIG_CXXFLAGS += -mno-sse2 endif ifeq "$(origin WITHGTK3)" "command line" WX_CONFIGURE_FLAGS += --with-gtk=3 endif endif #------ Mac OS X configuration ------ ifeq "$(shell uname -s)" "Darwin" PLATFORM := MacOSX APPNAME := VeraCrypt export VC_OSX_TARGET ?= 10.7 export VC_OSX_SDK ?= $(VC_OSX_TARGET) #check to see if 
XCode 3 path exists.Otherwise, use XCode 4 path VC_OSX_SDK_PATH := /Developer/SDKs/MacOSX$(VC_OSX_SDK).sdk ifeq ($(wildcard $(VC_OSX_SDK_PATH)/SDKSettings.plist),) VC_OSX_SDK_PATH := /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX$(VC_OSX_SDK).sdk endif #----- Legacy build if OSX <= 10.8: we build both 32-bit and 64-bit ---- ifneq (,$(filter 10.6 10.7 10.8,$(VC_OSX_TARGET))) export VC_LEGACY_BUILD := 1 endif CC := gcc CXX := g++ C_CXX_FLAGS += -DTC_UNIX -DTC_BSD -DTC_MACOSX -mmacosx-version-min=$(VC_OSX_TARGET) -isysroot $(VC_OSX_SDK_PATH) LFLAGS += -mmacosx-version-min=$(VC_OSX_TARGET) -Wl,-syslibroot $(VC_OSX_SDK_PATH) WX_CONFIGURE_FLAGS += --with-macosx-version-min=$(VC_OSX_TARGET) --with-macosx-sdk=$(VC_OSX_SDK_PATH) ifeq "$(CPU_ARCH)" "x64" CPU_ARCH = x86 endif ifeq "$(CPU_ARCH)" "arm64" CPU_ARCH = x86 endif CFLAGS += -msse2 CXXFLAGS += -msse2 ifeq "$(origin SSSE3)" "command line" CFLAGS += -mssse3 CXXFLAGS += -mssse3 endif ifeq "$(origin SSE41)" "command line" CFLAGS += -mssse3 -msse4.1 CXXFLAGS += -mssse3 -msse4.1 endif AS := $(BASE_DIR)/Build/Tools/MacOSX/yasm export ASFLAGS32 := -D __GNUC__ -D __YASM__ -D __BITS__=32 --prefix=_ -f macho32 export ASFLAGS64 := -D __GNUC__ -D __YASM__ -D __BITS__=64 --prefix=_ -f macho64 ifeq "$(TC_BUILD_CONFIG)" "Release" export DISABLE_PRECOMPILED_HEADERS := 1 S := $(C_CXX_FLAGS) C_CXX_FLAGS = $(subst -MMD,,$(S)) C_CXX_FLAGS += -gfull -arch x86_64 LFLAGS += -Wl,-dead_strip -arch x86_64 WX_CONFIGURE_FLAGS += --without-libpng --disable-gif --disable-pcx --disable-tga --disable-iff --disable-gif --disable-svg #----- Legacy build: we build both 32-bit and 64-bit ---- ifdef VC_LEGACY_BUILD C_CXX_FLAGS += -arch i386 LFLAGS += -arch i386 WX_CONFIGURE_FLAGS += --enable-universal_binary=i386,x86_64 else CXXFLAGS += -std=c++11 C_CXX_FLAGS += -arch arm64 LFLAGS += -arch arm64 WX_CONFIGURE_FLAGS += --enable-universal_binary=arm64,x86_64 endif WXCONFIG_CFLAGS += -gfull WXCONFIG_CXXFLAGS += 
-gfull else WX_CONFIGURE_FLAGS += --disable-universal_binary endif endif #------ FreeBSD configuration ------ ifeq "$(shell uname -s)" "FreeBSD" PLATFORM := FreeBSD PLATFORM_UNSUPPORTED := 1 C_CXX_FLAGS += -DTC_UNIX -DTC_BSD -DTC_FREEBSD CC := cc CXX := c++ ifeq "$(TC_BUILD_CONFIG)" "Release" C_CXX_FLAGS += -fdata-sections -ffunction-sections -fpie LFLAGS += -Wl,--gc-sections -pie ifneq "$(shell ld --help 2>&1 | grep sysv | wc -l)" "0" LFLAGS += -Wl,--hash-style=sysv endif WXCONFIG_CFLAGS += -fpie -fPIC WXCONFIG_CXXFLAGS += -fpie -fPIC endif ifeq "$(SIMD_SUPPORTED)" "1" CFLAGS += -msse2 CXXFLAGS += -msse2 ifeq "$(DISABLE_AESNI)" "1" CFLAGS += -mno-aes -DCRYPTOPP_DISABLE_AESNI CXXFLAGS += -mno-aes -DCRYPTOPP_DISABLE_AESNI else CFLAGS += -maes CXXFLAGS += -maes endif ifeq "$(origin SSSE3)" "command line" CFLAGS += -mssse3 CXXFLAGS += -mssse3 endif ifeq "$(origin SSE41)" "command line" CFLAGS += -mssse3 -msse4.1 CXXFLAGS += -mssse3 -msse4.1 endif endif ifeq "$(origin NOSSE2)" "command line" CFLAGS += -mno-sse2 CXXFLAGS += -mno-sse2 WXCONFIG_CFLAGS += -mno-sse2 WXCONFIG_CXXFLAGS += -mno-sse2 endif endif #------ OpenBSD configuration ------ ifeq "$(shell uname -s)" "OpenBSD" PLATFORM := OpenBSD PLATFORM_UNSUPPORTED := 1 C_CXX_FLAGS += -DTC_UNIX -DTC_BSD -DTC_OPENBSD CC := cc CXX := c++ ifeq "$(TC_BUILD_CONFIG)" "Release" C_CXX_FLAGS += -fdata-sections -ffunction-sections -fpie LFLAGS += -Wl,--gc-sections -pie WXCONFIG_CFLAGS += -fpie -fPIC WXCONFIG_CXXFLAGS += -fpie -fPIC endif endif #------ Solaris configuration ------ ifeq "$(shell uname -s)" "SunOS" PLATFORM := Solaris PLATFORM_UNSUPPORTED := 1 C_CXX_FLAGS += -DTC_UNIX -DTC_SOLARIS WX_CONFIGURE_FLAGS += --with-gtk endif #------ Common configuration ------ CFLAGS := $(C_CXX_FLAGS) $(CFLAGS) $(TC_EXTRA_CFLAGS) CXXFLAGS := $(C_CXX_FLAGS) $(CXXFLAGS) $(TC_EXTRA_CXXFLAGS) LFLAGS := $(LFLAGS) $(TC_EXTRA_LFLAGS) WX_CONFIGURE_FLAGS += --enable-unicode -disable-shared --disable-dependency-tracking --enable-exceptions 
--enable-std_string --enable-dataobj --enable-mimetype ifdef VC_WX_MINIMAL WX_CONFIGURE_FLAGS += --disable-protocol --disable-protocols --disable-url --disable-ipc --disable-sockets --disable-fs_inet --disable-ole --disable-docview --disable-clipboard \ --disable-help --disable-html --disable-mshtmlhelp --disable-htmlhelp --disable-mdi --disable-metafile --disable-webkit --disable-webview \ --disable-xrc --disable-aui --disable-postscript --disable-printarch \ --disable-arcstream --disable-fs_archive --disable-fs_zip --disable-tarstream --disable-zipstream \ --disable-animatectrl --disable-bmpcombobox --disable-calendar --disable-caret --disable-checklst --disable-collpane --disable-colourpicker --disable-comboctrl \ --disable-datepick --disable-display --disable-dirpicker --disable-filepicker --disable-fontpicker --disable-grid --disable-dataviewctrl \ --disable-listbook --disable-odcombobox --disable-sash --disable-searchctrl --disable-slider --disable-splitter --disable-togglebtn \ --disable-toolbar --disable-tbarnative --disable-treebook --disable-toolbook --disable-tipwindow --disable-popupwin \ --disable-commondlg --disable-aboutdlg --disable-coldlg --disable-finddlg --disable-fontdlg --disable-numberdlg --disable-splash \ --disable-tipdlg --disable-progressdlg --disable-wizarddlg --disable-miniframe --disable-splines --disable-palette \ --disable-richtext --disable-dialupman --disable-debugreport --disable-filesystem --disable-rearrangectrl --disable-treelist --disable-richmsgdlg \ --disable-richtooltip --disable-propgrid --disable-stc --without-libnotify \ --without-gtkprint --without-gnomevfs --disable-fsvolume --disable-fswatcher \ --disable-sound --disable-mediactrl --disable-joystick --disable-apple_ieee \ --disable-gif --disable-pcx --disable-tga --disable-iff --disable-gif --disable-pnm --disable-svg \ --without-expat --without-libtiff --without-libjpeg --without-libpng -without-regex --without-zlib ifeq "$(PLATFORM)" "Linux" WX_CONFIGURE_FLAGS += 
--disable-tooltips ifneq "$(origin WITHGTK3)" "command line" WX_CONFIGURE_FLAGS += --disable-graphics_ctx endif else WX_CONFIGURE_FLAGS += --disable-graphics_ctx endif endif #------ Project build ------ PROJ_DIRS := Platform Volume Driver/Fuse Core Main .PHONY: all clean wxbuild all clean: @if pwd | grep -q ' '; then echo 'Error: source code is stored in a path containing spaces' >&2; exit 1; fi @for DIR in $(PROJ_DIRS); do \ PROJ=$$(echo $$DIR | cut -d/ -f1); \ $(MAKE) -C $$DIR -f $$PROJ.make NAME=$$PROJ $(MAKECMDGOALS) || exit $?; \ export LIBS="$(BASE_DIR)/$$DIR/$$PROJ.a $$LIBS"; \ done install: $(MAKE) -C Main -f Main.make NAME=Main install package: $(MAKE) -C Main -f Main.make NAME=Main package #------ wxWidgets build ------ ifeq "$(MAKECMDGOALS)" "wxbuild" CFLAGS := CXXFLAGS := LFLAGS := endif wxbuild: ifneq "$(shell test -f $(WX_ROOT)/configure || test -f $(WX_BUILD_DIR)/../configure && echo 1)" "1" @echo 'Error: WX_ROOT must point to wxWidgets source code directory' >&2 @exit 1 endif rm -rf "$(WX_BUILD_DIR)" mkdir -p "$(WX_BUILD_DIR)" @echo Configuring wxWidgets library... cd "$(WX_BUILD_DIR)" && "$(WX_ROOT)/configure" $(WX_CONFIGURE_FLAGS) >/dev/null @echo Building wxWidgets library... 
cd "$(WX_BUILD_DIR)" && $(MAKE) -j 4 /a> 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 
830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918
; ---------------------------------------------------------------------------
; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
;
; LICENSE TERMS
;
; The free distribution and use of this software is allowed (with or without
; changes) provided that:
;
;  1. source code distributions include the above copyright notice, this
;     list of conditions and the following disclaimer;
;
;  2. binary distributions include the above copyright notice, this list
;     of conditions and the following disclaimer in their documentation;
;
;  3. the name of the copyright holder is not used to endorse products
;     built using this software without specific written permission.
;
; DISCLAIMER
;
; This software is provided 'as is' with no explicit or implied warranties
; in respect of its properties, including, but not limited to, correctness
; and/or fitness for purpose.
; ---------------------------------------------------------------------------
; Issue 20/12/2007
;
; I am grateful to Dag Arne Osvik for many discussions of the techniques that
; can be used to optimise AES assembler code on AMD64/EM64T architectures.
; Some of the techniques used in this implementation are the result of
; suggestions made by him for which I am most grateful.

;
; Adapted for TrueCrypt:
; - Compatibility with NASM
;

; An AES implementation for AMD64 processors using the YASM assembler.  This
; implementation provides only encryption, decryption and hence requires key
; scheduling support in C. It uses 8k bytes of tables but its encryption and
; decryption performance is very close to that obtained using large tables.
; It can use either Windows or Gnu/Linux calling conventions, which are as
; follows:
;               windows  gnu/linux
;
;   in_blk          rcx     rdi
;   out_blk         rdx     rsi
;   context (cx)     r8     rdx
;
;   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
;   registers       rdi      -      on both
;
;   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
;   registers        -      rdi     on both
;
; The default convention is that for windows, the gnu/linux convention being
; used if __GNUC__ is defined.
;
; Define _SEH_ to include support for Win64 structured exception handling
; (this requires YASM version 0.6 or later).
;
; This code provides the standard AES block size (128 bits, 16 bytes) and the
; three standard AES key sizes (128, 192 and 256 bits). It has the same call
; interface as my C implementation.  It uses the Microsoft C AMD64 calling
; conventions in which the three parameters are placed in  rcx, rdx and r8
; respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
;
;     AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
;                                            const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
;                                            const aes_decrypt_ctx cx[1]);
;
;     AES_RETURN aes_encrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_encrypt_ctx cx[1]);
;
;     AES_RETURN aes_decrypt_key(const unsigned char key[],
;                           unsigned int len, const aes_decrypt_ctx cx[1]);
;
; where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
; either bits or bytes.
;
; Comment in/out the following lines to obtain the desired subroutines. These
; selections MUST match those in the C header file aes.h

; Configuration as set in this copy: only 256-bit keys are enabled (AES_128,
; AES_192 and AES_VAR are commented out), so KS_LENGTH further down resolves
; to 60.  Encryption and decryption are both enabled, the decryption key
; schedule is taken to be reversed (AES_REV_DKS) and the last round uses its
; own tables (LAST_ROUND_TABLES).
; %define AES_128                 ; define if AES with 128 bit keys is needed
; %define AES_192                 ; define if AES with 192 bit keys is needed
%define AES_256                 ; define if AES with 256 bit keys is needed
; %define AES_VAR                 ; define if a variable key size is needed
%define ENCRYPTION              ; define if encryption is needed
%define DECRYPTION              ; define if decryption is needed
%define AES_REV_DKS             ; define if key decryption schedule is reversed
%define LAST_ROUND_TABLES       ; define for the faster version using extra tables

; The encryption key schedule has the following in memory layout where N is the
; number of rounds (10, 12 or 14):
;
; lo: | input key (round 0)  |  ; each round is four 32-bit words
;     | encryption round 1   |
;     | encryption round 2   |
;     ....
;     | encryption round N-1 |
; hi: | encryption round N   |
;
; The decryption key schedule is normally set up so that it has the same
; layout as above by actually reversing the order of the encryption key
; schedule in memory (this happens when AES_REV_DKS is set):
;
; lo: | decryption round 0   | =              | encryption round N   |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
; hi: | decryption round N   | =              | input key (round 0)  |
;
; with rounds except the first and last modified using inv_mix_column()
; But if AES_REV_DKS is NOT set the order of keys is left as it is for
; encryption so that it has to be accessed in reverse when used for
; decryption (although the inverse mix column modifications are done)
;
; lo: | decryption round 0   | =              | input key (round 0)  |
;     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
;     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
;     ....                       ....
;     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
; hi: | decryption round N   | =              | encryption round N   |
;
; This layout is faster when the assembler key scheduling provided here
; is used.
;
; The DLL interface must use the _stdcall convention in which the number
; of bytes of parameter space is added after an @ to the subroutine's name.
; We must also remove our parameters from the stack before return (see
; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.

;%define DLL_EXPORT

; End of user defines

; Derived settings -- not meant to be edited.
;
; A variable key size build needs every fixed key size, so AES_VAR switches
; on whichever of AES_256 / AES_192 / AES_128 was left undefined.

%ifdef AES_VAR
%ifndef AES_256
%define AES_256
%endif
%ifndef AES_192
%define AES_192
%endif
%ifndef AES_128
%define AES_128
%endif
%endif

; KS_LENGTH follows the largest enabled key size.  With four 32-bit words per
; round key and N+1 round keys this is 4*(N+1): 60 (N=14), 52 (N=12) or
; 44 (N=10).  AES_VAR needs no arm of its own here because the block above
; guarantees AES_256 is defined whenever AES_VAR is.

%ifdef AES_256
%define KS_LENGTH       60
%elifdef AES_192
%define KS_LENGTH       52
%else
%define KS_LENGTH       44
%endif

; Register aliases, grouped by physical register.
;
; The round macros in this file build 32-bit register names by appending `d`
; to a macro argument (e.g. `%1d`).  The extended registers already have such
; spellings (r8d, r9d, ...); the aliases below provide the same <name>d and
; <name>b spellings for the legacy registers, numbered names r0..r7 for them,
; and <name>h for the four registers that have a high-byte form.

; rax
%define     r0      rax
%define     r0d     eax
%define     r0h     ah
%define     raxd    eax
%define     raxb    al

; rdx
%define     r1      rdx
%define     r1d     edx
%define     r1h     dh
%define     rdxd    edx
%define     rdxb    dl

; rcx
%define     r2      rcx
%define     r2d     ecx
%define     r2h     ch
%define     rcxd    ecx
%define     rcxb    cl

; rbx
%define     r3      rbx
%define     r3d     ebx
%define     r3h     bh
%define     rbxd    ebx
%define     rbxb    bl

; rsi
%define     r4      rsi
%define     rsid    esi
%define     rsib    sil

; rdi
%define     r5      rdi
%define     rdid    edi
%define     rdib    dil

; rbp
%define     r6      rbp
%define     rbpd    ebp
%define     rbpb    bpl

; rsp
%define     r7      rsp
%define     rspd    esp
%define     rspb    spl

; Assembly-time finite field multiplies by {02}, {04} and {08}.
;
; x is a byte value (0..0xff).  The value is shifted left and every bit that
; leaves the low byte is cancelled by XORing in a matching multiple of the
; reduction constant 0x11b, so the result is again a byte value.
;
; The parameter is wrapped in parentheses at every use: single-line macro
; arguments are substituted textually, so without them a compound argument
; such as `a|b` would be split apart by operator precedence.

%define f2(x)   (((x)<<1)^((((x)>>7)&1)*0x11b))
%define f4(x)   (((x)<<2)^((((x)>>6)&1)*0x11b)^((((x)>>6)&2)*0x11b))
%define f8(x)   (((x)<<3)^((((x)>>5)&1)*0x11b)^((((x)>>5)&2)*0x11b)^((((x)>>5)&4)*0x11b))

; Finite field multiplies required in table generation, built by XORing
; (field addition) the power-of-two multiples above:
;   {03} = {02}^{01}          {09} = {08}^{01}
;   {0b} = {08}^{02}^{01}     {0d} = {08}^{04}^{01}
;   {0e} = {08}^{04}^{02}

%define f3(x)   (f2(x) ^ (x))
%define f9(x)   (f8(x) ^ (x))
%define fb(x)   (f8(x) ^ f2(x) ^ (x))
%define fd(x)   (f8(x) ^ f4(x) ^ (x))
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))

; macro for expanding S-box data
;
; enc_vals <fn>: emits the 256 encryption S-box values in index order
; 0x00..0xff, eight per db line (32 lines).  %1 is the name of a one-parameter
; single-line macro; it is applied to every value and whatever it expands to
; is what db emits for that entry.
; NOTE(review): the invocations are outside this excerpt -- presumably %1 is
; one of the u8/v8/w8 layouts defined after the tables; confirm at the call
; sites.

%macro enc_vals 1
    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
%endmacro

; dec_vals <fn>: decryption counterpart of enc_vals.  Emits the 256
; decryption (inverse) S-box values in index order 0x00..0xff, eight per db
; line (32 lines).  %1 is the name of a one-parameter single-line macro that
; is applied to every value; its expansion is what db emits for that entry.
; NOTE(review): the invocations are outside this excerpt -- presumably %1 is
; v8 or w8; confirm at the call sites.
%macro dec_vals 1
    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
%endmacro

; Eight-byte table entry layouts; each takes one byte value x and expands to
; a comma-separated list of eight byte expressions suitable for db.
;
;   u8(x)  {02}x, x, x, {03}x  written twice.  Byte 1 is the plain value x,
;          which is the byte offset tab_f reads.
;   v8(x)  {0e}x, {09}x, {0d}x, {0b}x, then {0e}x, {09}x, {0d}x and finally
;          the plain value x in byte 7 (not {0b}x), which is the byte offset
;          tab_i reads.
;   w8(x)  x, 0, 0, 0  written twice.
;
; Repeating the 4-byte pattern means a dword read at byte offset 0, 3, 2 or 1
; of an entry (the offsets used by tab_0..tab_3) yields the same four bytes
; in four different rotations.
; NOTE(review): which layout is used for which table is decided where the
; tables are emitted, outside this excerpt.
%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
%define w8(x)   x, 0, 0, 0, x, 0, 0, 0

; Addressing helpers used by the round macros.
;
; All arguments are pasted in textually (no parentheses are added), so pass
; plain numbers or register names only.
%define tptr    rbp     ; table pointer
%define kptr    r8      ; key schedule pointer
%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
; fk_ref(x,y): memory operand at kptr - 16*x + fofs + 4*y, i.e. x steps in
; units of 16 bytes (downwards) and y in units of 4 bytes (upwards).
%define fk_ref(x,y) [kptr-16*x+fofs+4*y]
; ik_ref(x,y): same shape as fk_ref with offset rofs.  With AES_REV_DKS the
; x term is subtracted and rofs is +128; without it the x term is added and
; rofs is -128.
%ifdef  AES_REV_DKS
%define rofs    128
%define ik_ref(x,y) [kptr-16*x+rofs+4*y]
%else
%define rofs    -128
%define ik_ref(x,y) [kptr+16*x+rofs+4*y]
%endif

; Table entries are 8 bytes apart (index scaled by 8).  tab_0..tab_3 address
; an entry at byte offsets 0, 3, 2 and 1 with no size keyword, so the operand
; size comes from the other operand of the instruction; tab_f and tab_i are
; explicit byte operands at offsets 1 and 7.
%define tab_0(x)   [tptr+8*x]
%define tab_1(x)   [tptr+8*x+3]
%define tab_2(x)   [tptr+8*x+2]
%define tab_3(x)   [tptr+8*x+1]
%define tab_f(x)   byte [tptr+8*x+1]
%define tab_i(x)   byte [tptr+8*x+7]
; t_ref(x,r): pastes x onto "tab_" to select one of the macros above and
; applies it to r, e.g. t_ref(2,rsi) -> tab_2(rsi).
%define t_ref(x,r) tab_ %+ x(r)

; ff_rnd %1, %2, %3, %4, %5
;
;   %1..%4  register names; the macro appends `d` to them to use their
;           32-bit forms as accumulators for the four output words
;   %5      round selector handed to fk_ref to pick the four key words
;
; In:     eax, ebx, ecx, edx = the four input words
;         tptr / kptr as required by t_ref / fk_ref
; Out:    eax, ebx, ecx, edx = the four output words (copied from %1d..%4d)
; Uses:   esi and edi as table indices, %1d..%4d, flags; the input words are
;         shifted in place, so %1..%4 must not name rax, rbx, rcx or rdx
;
; Each output word starts as a key word and then has one table lookup XORed
; in for every input byte (16 lookups in total).
%macro ff_rnd 5                 ; normal forward round
    ; start the four accumulators from the key words selected by %5
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    ; eax: byte 0 -> %1 (tab_0), byte 1 -> %4 (tab_1),
    ;      byte 2 -> %3 (tab_2), byte 3 -> %2 (tab_3)
    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    ; ebx: byte 0 -> %2, byte 1 -> %1, byte 2 -> %4, byte 3 -> %3
    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    ; ecx: byte 0 -> %3, byte 1 -> %2, byte 2 -> %1, byte 3 -> %4
    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    ; edx: byte 0 -> %4, byte 1 -> %3, byte 2 -> %2, byte 3 -> %1
    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    ; hand the result back in the registers the next round reads from
    mov     eax,%1d
    mov     ebx,%2d
    mov     ecx,%3d
    mov     edx,%4d
%endmacro

%ifdef LAST_ROUND_TABLES

; fl_rnd %1, %2, %3, %4, %5   (variant built when LAST_ROUND_TABLES is defined)
;
;   %1..%4  register names; `d` is appended to use their 32-bit forms as
;           accumulators for the four output words
;   %5      round selector handed to fk_ref to pick the four key words
;
; In:     eax, ebx, ecx, edx = the four input words
;         tptr / kptr as required by t_ref / fk_ref
; Out:    %1d..%4d = the four output words.  Unlike ff_rnd the result is NOT
;         copied back to eax..edx.
; Uses:   esi and edi as table indices, flags; eax..edx are shifted in place;
;         tptr is advanced by 2048 and left there
;
; The lookup pattern is the same as in ff_rnd; only the table differs.
%macro fl_rnd 5                 ; last forward round
    ; step over one 256-entry table of 8-byte entries (256*8 = 2048)
    ; NOTE(review): presumably the last-round table sits directly after the
    ; normal round table -- the table definitions are outside this excerpt.
    add     tptr, 2048
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    ; eax: byte 0 -> %1, byte 1 -> %4, byte 2 -> %3, byte 3 -> %2
    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    ; ebx: byte 0 -> %2, byte 1 -> %1, byte 2 -> %4, byte 3 -> %3
    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    ; ecx: byte 0 -> %3, byte 1 -> %2, byte 2 -> %1, byte 3 -> %4
    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    ; edx: byte 0 -> %4, byte 1 -> %3, byte 2 -> %2, byte 3 -> %1
    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)
%endmacro

%else

%macro fl_rnd 5                 ; last forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %1d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %2d, edi

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %2d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %3d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %4d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %1d, edi
%endmacro

%endif

; ii_rnd_col: one state word of a normal inverse round.
;   arg 1      32-bit state register (eax/ebx/ecx/edx); shifted right by 16
;   arg 2, 3   its low-byte and second-byte aliases (al/ah, bl/bh, ...)
;   arg 4..7   64-bit names of the accumulators that take the table 0, 1, 2
;              and 3 lookups for state bytes 0, 1, 2 and 3 respectively
; Clobbers esi and edi.
%macro ii_rnd_col 7
    movzx   esi, %2
    movzx   edi, %3
    shr     %1, 16
    xor     %4d, t_ref(0,rsi)
    xor     %5d, t_ref(1,rdi)
    movzx   esi, %2
    movzx   edi, %3
    xor     %6d, t_ref(2,rsi)
    xor     %7d, t_ref(3,rdi)
%endmacro

; ii_rnd: normal inverse round.
;   in/out     state words in eax, ebx, ecx, edx
;   arg 1..4   64-bit names of four scratch registers; their 32-bit views
;              are seeded with round key words 0..3 (via ik_ref) and
;              accumulate the table lookups before being copied back
;   arg 5      round key slot passed to ik_ref
; Clobbers esi, edi and the four scratch registers.  Compared with ff_rnd
; the bytes of each state word are spread over the accumulators in the
; opposite rotation.
%macro ii_rnd 5                 ; normal inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    ii_rnd_col  eax, al, ah, %1, %2, %3, %4
    ii_rnd_col  ebx, bl, bh, %2, %3, %4, %1
    ii_rnd_col  ecx, cl, ch, %3, %4, %1, %2
    ii_rnd_col  edx, dl, dh, %4, %1, %2, %3

    mov     eax, %1d
    mov     ebx, %2d
    mov     ecx, %3d
    mov     edx, %4d
%endmacro

; il_rnd: last inverse round.  Two variants share the same interface:
;   in         state words in eax, ebx, ecx, edx (destroyed)
;   arg 1..4   64-bit names of the result registers; their 32-bit views are
;              seeded with round key words 0..3 and hold the output words
;   arg 5      round key slot passed to ik_ref
; Clobbers esi and edi.  Unlike ii_rnd the result is NOT copied back to
; eax..edx.
;
; il_rnd_col is the per-state-word helper for whichever variant is built:
;   arg 1      32-bit state register, arg 2/3 its low/second byte aliases
;   arg 4..7   accumulators for state bytes 0, 1, 2 and 3

%ifdef LAST_ROUND_TABLES

; Variant with dedicated last-round tables: same lookup pattern as a normal
; inverse round, applied after tptr has been advanced.
%macro il_rnd_col 7
    movzx   esi, %2
    movzx   edi, %3
    shr     %1, 16
    xor     %4d, t_ref(0,rsi)
    xor     %5d, t_ref(1,rdi)
    movzx   esi, %2
    movzx   edi, %3
    xor     %6d, t_ref(2,rsi)
    xor     %7d, t_ref(3,rdi)
%endmacro

%macro il_rnd 5                 ; last inverse round
    ; step tptr forward by 2048 bytes -- presumably onto the second table
    ; emitted after the first one when LAST_ROUND_TABLES is defined
    add     tptr, 2048
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    il_rnd_col  eax, al, ah, %1, %2, %3, %4
    il_rnd_col  ebx, bl, bh, %2, %3, %4, %1
    il_rnd_col  ecx, cl, ch, %3, %4, %1, %2
    il_rnd_col  edx, dl, dh, %4, %1, %2, %3
%endmacro

%else

; Variant without last-round tables: each state byte is replaced by the
; single byte tab_i of its table entry, rotated into byte lane 0, 1, 2 or 3
; and xored into the matching accumulator.
%macro il_rnd_col 7
    movzx   esi, %2
    movzx   edi, %3
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     %1, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %5d, edi
    movzx   esi, %2
    movzx   edi, %3
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %6d, esi
    xor     %7d, edi
%endmacro

%macro il_rnd 5                 ; last inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    il_rnd_col  eax, al, ah, %1, %2, %3, %4
    il_rnd_col  ebx, bl, bh, %2, %3, %4, %1
    il_rnd_col  ecx, cl, ch, %3, %4, %1, %2
    il_rnd_col  edx, dl, dh, %4, %1, %2, %3
%endmacro

%endif

%ifdef ENCRYPTION

;-----------------------------------------------------------------------
; aes_encrypt: encrypt one 16-byte block.
;
; Arguments (1st = input pointer, 2nd = output pointer, 3rd = context):
;   gnu/linux interface:  rdi, rsi, rdx
;   windows interface:    rcx, rdx, r8
; Returns rax = 0 on success.  Returns rax = -1, without writing to the
; output, when the byte at [context + 4*KS_LENGTH] is not one of 10*16,
; 12*16 or 14*16 (which select 10, 12 or 14 rounds).
;
; Stack frame (offsets from rsp after the prologue):
;   +0*8 output pointer   +1*8 rbx   +2*8 rbp   +3*8 r12
;   +4*8 rsi, +5*8 rdi    (windows interfaces only)
; rbx, rbp and r12 are restored on every exit path, as are rsi and rdi on
; the windows interfaces.  rax, rcx, rdx, r8-r11 are used as scratch.
;
; The epilogue tests _SEH_ before __GNUC__, exactly like the prologue, so
; the frame size released always matches the one allocated even when both
; symbols are defined.
;-----------------------------------------------------------------------

    global  aes_encrypt
%ifdef DLL_EXPORT
    export  aes_encrypt
%endif

    section .data align=64
    align   64
enc_tab:
    enc_vals u8
%ifdef LAST_ROUND_TABLES
    enc_vals w8
%endif

    section .text align=16
    align   16

%ifdef _SEH_
proc_frame aes_encrypt
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg    rsi,4*8
    save_reg    rdi,5*8
    save_reg    rbx,1*8
    save_reg    rbp,2*8
    save_reg    r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
aes_encrypt:
  %ifdef __GNUC__
    sub     rsp, 4*8        ; gnu/linux binary interface
    mov     [rsp+0*8], rsi  ; output pointer
    mov     r8, rdx         ; context
  %else
    sub     rsp, 6*8        ; windows binary interface
    mov     [rsp+4*8], rsi
    mov     [rsp+5*8], rdi
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
  %endif
    mov     [rsp+1*8], rbx  ; input pointer in rdi
    mov     [rsp+2*8], rbp  ; output pointer in [rsp]
    mov     [rsp+3*8], r12  ; context in r8
%endif

    ; esi = key length marker (16 * number of rounds), checked further down
    movzx   esi, byte [kptr+4*KS_LENGTH]
    lea     tptr, [rel enc_tab]
    sub     kptr, fofs      ; bias kptr; fk_ref adds fofs back

    ; load the input block into the state registers
    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

    ; xor in the first 16 bytes of the key schedule
    xor     eax, [kptr+fofs]
    xor     ebx, [kptr+fofs+4]
    xor     ecx, [kptr+fofs+8]
    xor     edx, [kptr+fofs+12]

    ; advance kptr by the length marker so that fk_ref slot numbers count
    ; backwards from there, then enter the unrolled rounds at the point
    ; that leaves 14, 12 or 10 rounds to run
    lea     kptr,[kptr+rsi]
    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1         ; unsupported length marker: fail
    jmp     .4

.1: ff_rnd  r9, r10, r11, r12, 13
    ff_rnd  r9, r10, r11, r12, 12
.2: ff_rnd  r9, r10, r11, r12, 11
    ff_rnd  r9, r10, r11, r12, 10
.3: ff_rnd  r9, r10, r11, r12, 9
    ff_rnd  r9, r10, r11, r12, 8
    ff_rnd  r9, r10, r11, r12, 7
    ff_rnd  r9, r10, r11, r12, 6
    ff_rnd  r9, r10, r11, r12, 5
    ff_rnd  r9, r10, r11, r12, 4
    ff_rnd  r9, r10, r11, r12, 3
    ff_rnd  r9, r10, r11, r12, 2
    ff_rnd  r9, r10, r11, r12, 1
    fl_rnd  r9, r10, r11, r12, 0

    ; fl_rnd leaves the result in r9d..r12d; store it to the output block
    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax        ; success
.4:
    ; common exit: restore saved registers and release the frame
    mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef _SEH_
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    add     rsp, 7*8
    ret
endproc_frame
%else
  %ifdef __GNUC__
    add     rsp, 4*8
    ret
  %else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    add     rsp, 6*8
    ret
  %endif
%endif

%endif

%ifdef DECRYPTION

;-----------------------------------------------------------------------
; aes_decrypt: decrypt one 16-byte block.
;
; Arguments (1st = input pointer, 2nd = output pointer, 3rd = context):
;   gnu/linux interface:  rdi, rsi, rdx
;   windows interface:    rcx, rdx, r8
; Returns rax = 0 on success.  Returns rax = -1, without writing to the
; output, when the byte at [context + 4*KS_LENGTH] is not one of 10*16,
; 12*16 or 14*16 (which select 10, 12 or 14 rounds).
;
; Stack frame (offsets from rsp after the prologue):
;   +0*8 output pointer   +1*8 rbx   +2*8 rbp   +3*8 r12
;   +4*8 rsi, +5*8 rdi    (windows interfaces only)
; rbx, rbp and r12 are restored on every exit path, as are rsi and rdi on
; the windows interfaces.  rax, rcx, rdx, r8-r11 are used as scratch.
;
; The epilogue tests _SEH_ before __GNUC__, exactly like the prologue, so
; the frame size released always matches the one allocated even when both
; symbols are defined.
;-----------------------------------------------------------------------

    global  aes_decrypt
%ifdef DLL_EXPORT
    export  aes_decrypt
%endif

    section .data
    align   64
dec_tab:
    dec_vals v8
%ifdef LAST_ROUND_TABLES
    dec_vals w8
%endif

    section .text
    align   16

%ifdef _SEH_
proc_frame aes_decrypt
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg    rsi,4*8
    save_reg    rdi,5*8
    save_reg    rbx,1*8
    save_reg    rbp,2*8
    save_reg    r12,3*8
end_prologue
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
%else
aes_decrypt:
  %ifdef __GNUC__
    sub     rsp, 4*8        ; gnu/linux binary interface
    mov     [rsp+0*8], rsi  ; output pointer
    mov     r8, rdx         ; context
  %else
    sub     rsp, 6*8        ; windows binary interface
    mov     [rsp+4*8], rsi
    mov     [rsp+5*8], rdi
    mov     rdi, rcx        ; input pointer
    mov     [rsp+0*8], rdx  ; output pointer
  %endif
    mov     [rsp+1*8], rbx  ; input pointer in rdi
    mov     [rsp+2*8], rbp  ; output pointer in [rsp]
    mov     [rsp+3*8], r12  ; context in r8
%endif

    ; esi = key length marker (16 * number of rounds), checked further down
    movzx   esi,byte[kptr+4*KS_LENGTH]
    lea     tptr, [rel dec_tab]
    sub     kptr, rofs      ; bias kptr; ik_ref adds rofs back

    ; load the input block into the state registers
    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

    ; rdi = address (biased by rofs) of the round key applied first:
    ; with AES_REV_DKS it is the start of the schedule and kptr is moved to
    ; the end; otherwise it is the end of the schedule and kptr stays put
%ifdef      AES_REV_DKS
    mov     rdi, kptr
    lea     kptr,[kptr+rsi]
%else
    lea     rdi,[kptr+rsi]
%endif

    xor     eax, [rdi+rofs]
    xor     ebx, [rdi+rofs+4]
    xor     ecx, [rdi+rofs+8]
    xor     edx, [rdi+rofs+12]

    ; enter the unrolled rounds at the point that leaves 14, 12 or 10
    ; rounds to run
    cmp     esi, 10*16
    je      .3
    cmp     esi, 12*16
    je      .2
    cmp     esi, 14*16
    je      .1
    mov     rax, -1         ; unsupported length marker: fail
    jmp     .4

.1: ii_rnd  r9, r10, r11, r12, 13
    ii_rnd  r9, r10, r11, r12, 12
.2: ii_rnd  r9, r10, r11, r12, 11
    ii_rnd  r9, r10, r11, r12, 10
.3: ii_rnd  r9, r10, r11, r12, 9
    ii_rnd  r9, r10, r11, r12, 8
    ii_rnd  r9, r10, r11, r12, 7
    ii_rnd  r9, r10, r11, r12, 6
    ii_rnd  r9, r10, r11, r12, 5
    ii_rnd  r9, r10, r11, r12, 4
    ii_rnd  r9, r10, r11, r12, 3
    ii_rnd  r9, r10, r11, r12, 2
    ii_rnd  r9, r10, r11, r12, 1
    il_rnd  r9, r10, r11, r12, 0

    ; il_rnd leaves the result in r9d..r12d; store it to the output block
    mov     rbx, [rsp]
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax        ; success
.4: mov     rbx, [rsp+1*8]  ; common exit: restore registers, release frame
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef _SEH_
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    add     rsp, 7*8
    ret
endproc_frame
%else
  %ifdef __GNUC__
    add     rsp, 4*8
    ret
  %else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
    add     rsp, 6*8
    ret
  %endif
%endif

%endif

; For every ELF output format (elf, elf32, elf64) emit an empty
; .note.GNU-stack section.  By GNU toolchain convention this marks the
; object as not needing an executable stack.  Each format name needs its
; own test because ifidn compares against a single token sequence.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif