;
; Copyright (c) 2010 TrueCrypt Developers Association. All rights reserved.
;
; Governed by the TrueCrypt License 3.0 the full text of which is contained in
; the file License.txt included in TrueCrypt binary and source code distribution
; packages.
;


%ifidn __BITS__, 16
	%define R e
%elifidn __BITS__, 32
	%define R e
%elifidn __BITS__, 64
	%define R r
%endif


%macro export_function 1-2 0

	%ifdef MS_STDCALL
		global %1@%2
		export _%1@%2
	%1@%2:
	%elifidn __BITS__, 16
		global _%1
	_%1:
	%else
		global %1
	%1:
	%endif

%endmacro


%macro aes_function_entry 1

	; void (const byte *ks, byte *data);

	export_function %1, 8

	%ifidn __BITS__, 32
		mov ecx, [esp + 4 + 4 * 0]
		mov edx, [esp + 4 + 4 * 1]
	%elifidn __BITS__, 64
		%ifnidn __OUTPUT_FORMAT__, win64
			mov rcx, rdi
			mov rdx, rsi
		%endif
	%endif

	; ecx/rcx = ks
	; edx/rdx = data

%endmacro


%macro aes_function_exit 0

	; void (const byte *, byte *);

	%ifdef MS_STDCALL
		ret 8
	%else
		ret
	%endif

%endmacro


%macro push_xmm 2
	sub rsp, 16 * (%2 - %1 + 1)

	%assign stackoffset 0
	%assign regnumber %1

	%rep (%2 - %1 + 1)
		movdqu [rsp + 16 * stackoffset], xmm%[regnumber]

		%assign stackoffset stackoffset+1
		%assign regnumber regnumber+1
	%endrep
%endmacro


%macro pop_xmm 2
	%assign stackoffset 0
	%assign regnumber %1

	%rep (%2 - %1 + 1)
		movdqu xmm%[regnumber], [rsp + 16 * stackoffset]

		%assign stackoffset stackoffset+1
		%assign regnumber regnumber+1
	%endrep

	add rsp, 16 * (%2 - %1 + 1)
%endmacro


%macro aes_hw_cpu 2
	%define OPERATION %1
	%define BLOCK_COUNT %2

	; Load data blocks
	%assign block 1
	%rep BLOCK_COUNT
		movdqu xmm%[block], [%[R]dx + 16 * (block - 1)]
		%assign block block+1
	%endrep

	; Encrypt/decrypt data blocks
	%assign round 0
	%rep 15
		movdqu xmm0, [%[R]cx + 16 * round]

		%assign block 1
		%rep BLOCK_COUNT

			%if round = 0
				pxor xmm%[block], xmm0
			%else
				%if round < 14
					aes%[OPERATION] xmm%[block], xmm0
				%else
					aes%[OPERATION]last xmm%[block], xmm0
				%endif
			%endif

			%assign block block+1
		%endrep

		%assign round round+1
	%endrep

	; Store data blocks
	%assign block 1
	%rep BLOCK_COUNT
		movdqu [%[R]dx + 16 * (block - 1)], xmm%[block]
		%assign block block+1
	%endrep

	%undef OPERATION
	%undef BLOCK_COUNT
%endmacro


%macro aes_hw_cpu_32_blocks 1
	%define OPERATION_32_BLOCKS %1

	%ifidn __BITS__, 64
		%define MAX_REG_BLOCK_COUNT 15
	%else
		%define MAX_REG_BLOCK_COUNT 7
	%endif

	%ifidn __OUTPUT_FORMAT__, win64
		%if MAX_REG_BLOCK_COUNT > 5
			push_xmm 6, MAX_REG_BLOCK_COUNT
		%endif
	%endif

		mov eax, 32 / MAX_REG_BLOCK_COUNT
	.1:
		aes_hw_cpu %[OPERATION_32_BLOCKS], MAX_REG_BLOCK_COUNT

		add %[R]dx, 16 * MAX_REG_BLOCK_COUNT
		dec eax
		jnz .1

	%if (32 % MAX_REG_BLOCK_COUNT) != 0
		aes_hw_cpu %[OPERATION_32_BLOCKS], (32 % MAX_REG_BLOCK_COUNT)
	%endif

	%ifidn __OUTPUT_FORMAT__, win64
		%if MAX_REG_BLOCK_COUNT > 5
			pop_xmm 6, MAX_REG_BLOCK_COUNT
		%endif
	%endif

	%undef OPERATION_32_BLOCKS
	%undef MAX_REG_BLOCK_COUNT
%endmacro


%ifidn __BITS__, 16

	USE16
	SEGMENT _TEXT PUBLIC CLASS=CODE USE16
	SEGMENT _DATA PUBLIC CLASS=DATA USE16
	GROUP DGROUP _TEXT _DATA
	SECTION _TEXT

%else

	SECTION .text

%endif


; void aes_hw_cpu_enable_sse ();

	export_function aes_hw_cpu_enable_sse
		mov %[R]ax, cr4
		or ax, 1 << 9
		mov cr4, %[R]ax
	ret


%ifidn __BITS__, 16


; byte is_aes_hw_cpu_supported ();

	export_function is_aes_hw_cpu_supported
		mov eax, 1
		cpuid
		mov eax, ecx
		shr eax, 25
		and al, 1
	ret


; void aes_hw_cpu_decrypt (const byte *ks, byte *data);

	export_function aes_hw_cpu_decrypt
		mov ax, -16
		jmp aes_hw_cpu_encrypt_decrypt

; void aes_hw_cpu_encrypt (const byte *ks, byte *data);

	export_function aes_hw_cpu_encrypt
		mov ax, 16

	aes_hw_cpu_encrypt_decrypt:
		push bp
		mov bp, sp
		push di
		push si

		mov si, [bp + 4]			; ks
		mov di, [bp + 4 + 2]		; data

		movdqu xmm0, [si]
		movdqu xmm1, [di]

		pxor xmm1, xmm0

		mov cx, 13

	.round1_13:
		add si, ax
		movdqu xmm0, [si]

		cmp ax, 0
		jl .decrypt
		
		aesenc xmm1, xmm0
		jmp .2
	.decrypt:
		aesdec xmm1, xmm0
	.2:
		loop .round1_13

		add si, ax
		movdqu xmm0, [si]
		
		cmp ax, 0
		jl .decrypt_last

		aesenclast xmm1, xmm0
		jmp .3
	.decrypt_last:
		aesdeclast xmm1, xmm0
	.3:
		movdqu [di], xmm1

		pop si
		pop di
		pop bp
	ret


%else	; __BITS__ != 16


; byte is_aes_hw_cpu_supported ();

	export_function is_aes_hw_cpu_supported
		push %[R]bx

		mov eax, 1
		cpuid
		mov eax, ecx
		shr eax, 25
		and eax, 1

		pop %[R]bx
	ret


; void aes_hw_cpu_decrypt (const byte *ks, byte *data);

	aes_function_entry aes_hw_cpu_decrypt
		aes_hw_cpu dec, 1
	aes_function_exit


; void aes_hw_cpu_decrypt_32_blocks (const byte *ks, byte *data);

	aes_function_entry aes_hw_cpu_decrypt_32_blocks
		aes_hw_cpu_32_blocks dec
	aes_function_exit


; void aes_hw_cpu_encrypt (const byte *ks, byte *data);

	aes_function_entry aes_hw_cpu_encrypt
		aes_hw_cpu enc, 1
	aes_function_exit


; void aes_hw_cpu_encrypt_32_blocks (const byte *ks, byte *data);

	aes_function_entry aes_hw_cpu_encrypt_32_blocks
		aes_hw_cpu_32_blocks enc
	aes_function_exit


%endif	; __BITS__ != 16