VeraCrypt
aboutsummaryrefslogtreecommitdiff
path: root/src/Crypto/sha256_avx1_x64.asm
blob: 5c4ce5593539dcd884bf21b6473f7fa5b67af08f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
-----BEGIN CERTIFICATE-----
MIIEnDCCA4SgAwIBAgIQR5dNeHOlvKsNL7NwGS/OXjANBgkqhkiG9w0BAQUFADCB
qTELMAkGA1UEBhMCVVMxFTATBgNVBAoTDHRoYXd0ZSwgSW5jLjEoMCYGA1UECxMf
Q2VydGlmaWNhdGlvbiBTZXJ2aWNlcyBEaXZpc2lvbjE4MDYGA1UECxMvKGMpIDIw
MDYgdGhhd3RlLCBJbmMuIC0gRm9yIGF1dGhvcml6ZWQgdXNlIG9ubHkxHzAdBgNV
BAMTFnRoYXd0ZSBQcmltYXJ5IFJvb3QgQ0EwHhcNMTAwMjA4MDAwMDAwWhcNMjAw
MjA3MjM1OTU5WjBKMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMVGhhd3RlLCBJbmMu
MSQwIgYDVQQDExtUaGF3dGUgQ29kZSBTaWduaW5nIENBIC0gRzIwggEiMA0GCSqG
SIb3DQEBAQUAA4IBDwAwggEKAoIBAQC3i891W58l2n45sJPbONOpI9CC+ukkflwL
joP45npZ5qPFmKeZ0kT/AKalOQSK2imI6tui8xyZFSbCsfT84QxHqQkRBgogkrnH
oASMXJQZq1slLB1ifnANzmFs3SuCyc5dSF/3wr68QSMeTyld10+89MUq/GPmfCZO
mad5QZ4QSnp5ycaG94aV0ibOPBgq1nzOr82tu/eCLHAmN0XlD0cixgEovS6DXGqk
R8Hn0NhrgUY/IRf1B8VDWqZnLLh7YBG1g+71dApycUQ9WP7oGqs4w1nbf244fXbH
cmmYNpZX02Yc0lSRBC5UGbDcPbUiXobVKn4g313merFl/sUCTjEtAgMBAAGjggEc
MIIBGDASBgNVHRMBAf8ECDAGAQH/AgEAMDQGA1UdHwQtMCswKaAnoCWGI2h0dHA6
Ly9jcmwudGhhd3RlLmNvbS9UaGF3dGVQQ0EuY3JsMA4GA1UdDwEB/wQEAwIBBjAy
BggrBgEFBQcBAQQmMCQwIgYIKwYBBQUHMAGGFmh0dHA6Ly9vY3NwLnRoYXd0ZS5j
b20wHQYDVR0lBBYwFAYIKwYBBQUHAwIGCCsGAQUFBwMDMCkGA1UdEQQiMCCkHjAc
MRowGAYDVQQDExFWZXJpU2lnbk1QS0ktMi0xMDAdBgNVHQ4EFgQU1A1lP3q9NMb+
R+dMDcC98t4Vq3EwHwYDVR0jBBgwFoAUe1tFz6/Oy3r9MZIaarbzRutXSFAwDQYJ
KoZIhvcNAQEFBQADggEBAFb+U1zhx568p+1+U21qFEtRjEBegF+qpOgv7zjIBMnK
Ps/fOlhOsNS2Y8UpV/oCBZpFTWjbKhvUND2fAMNay5VJpW7hsMX8QU1BSm/Td8jX
OI3kGd4Y8x8VZYNtRQxT+QqaLqVdv28ygRiSGWpVAK1jHFIGflXZKWiuSnwYmnmI
ayMj2Cc4KimHdsr7x7ZiIx/telZM3ZwyW/U9DEYYlTsqI2iDZEHZAG0PGSQVaHK9
xXFnbqxM25DrUaUaYgfQvmoARzxyL+xPYT5zhc5aCre6wBwTdeMiOSjdbR0JRp1P
uuhAgZHGpM6UchsBzypuFWeVia59t7fN+Qo9dbZrPCU=
-----END CERTIFICATE-----
>236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation 
; 
; All rights reserved. 
; 
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met: 
; 
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.  
; 
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution. 
; 
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission. 
; 
; 
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Example YASM command lines:
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_avx1.obj -g cv8 sha256_avx1.asm
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_avx1.o sha256_avx1.asm
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded 
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 blocks at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define	VMOVDQ vmovdqu ;; assume buffers not aligned 

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm [mem], reg
; Add reg to mem using reg-mem add and store
%macro addm 2
	add	%2, %1
	mov	%1, %2
%endm

%macro MY_ROR 2
	shld	%1,%1,(32-(%2))
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
	VMOVDQ %1, %2
	vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9
%define XTMP5 xmm11

%define SHUF_00BA	xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00	xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK	xmm13
	
%ifndef WINABI
%define NUM_BLKS rdx	; 3rd arg
%define CTX	rsi	; 2nd arg
%define INP	rdi	; 1st arg

%define SRND	rdi	; clobbers INP
%define c	ecx
%define d 	r8d
%define e 	edx
%else
%define NUM_BLKS r8	; 3rd arg
%define CTX	rdx 	; 2nd arg
%define INP	rcx 	; 1st arg

%define SRND	rcx	; clobbers INP
%define c 	edi 
%define d	esi 
%define e 	r8d
	
%endif
%define TBL	rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d


_INP_END_SIZE	equ 8
_INP_SIZE	equ 8
_XFER_SIZE	equ 8
%ifndef WINABI
_XMM_SAVE_SIZE	equ 0
%else
_XMM_SAVE_SIZE	equ 8*16
%endif
; STACK_SIZE plus pushes must be an odd multiple of 8
_ALIGN_SIZE	equ 8

_INP_END	equ 0
_INP		equ _INP_END  + _INP_END_SIZE
_XFER		equ _INP      + _INP_SIZE
_XMM_SAVE	equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
STACK_SIZE	equ _XMM_SAVE + _XMM_SAVE_SIZE

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
		;; compute s0 four at a time and s1 two at a time
		;; compute W[-16] + W[-7] 4 at a time
		;vmovdqa	XTMP0, X3
	mov	y0, e		; y0 = e
	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
	mov	y1, a		; y1 = a
		vpalignr	XTMP0, X3, X2, 4	; XTMP0 = W[-7]
	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
	xor	y0, e		; y0 = e ^ (e >> (25-11))
	mov	y2, f		; y2 = f
	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
		;vmovdqa	XTMP1, X1
	xor	y1, a		; y1 = a ^ (a >> (22-13)
	xor	y2, g		; y2 = f^g
		vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e		; y2 = (f^g)&e
	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
		;; compute s0
		vpalignr	XTMP1, X1, X0, 4	; XTMP1 = W[-15]
	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor	y2, g		; y2 = CH = ((f^g)&e)^g
	
		
	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, y0		; y2 = S1 + CH
	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH

	mov	y0, a		; y0 = a
	add	h, y2		; h = h + S1 + CH + k + w
	mov	y2, a		; y2 = a
	
		vpsrld	XTMP2, XTMP1, 7
		
	or	y0, c		; y0 = a|c
	add	d, h		; d = d + h + S1 + CH + k + w
	and	y2, c		; y2 = a&c
	
		vpslld	XTMP3, XTMP1, (32-7)
		
	and	y0, b		; y0 = (a|c)&b
	add	h, y1		; h = h + S1 + CH + k + w + S0
	
		vpor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7
		
	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS

	mov	y0, e		; y0 = e
	mov	y1, a		; y1 = a


	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
	xor	y0, e		; y0 = e ^ (e >> (25-11))
	mov	y2, f		; y2 = f
	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)

		vpsrld	XTMP2, XTMP1,18

	xor	y1, a		; y1 = a ^ (a >> (22-13)
	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	y2, g		; y2 = f^g
	
		vpsrld	XTMP4, XTMP1, 3	; XTMP4 = W[-15] >> 3
		
	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e		; y2 = (f^g)&e
	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
		
		vpslld	XTMP1, XTMP1, (32-18)

	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	y2, g		; y2 = CH = ((f^g)&e)^g

		vpxor	XTMP3, XTMP3, XTMP1

	add	y2, y0		; y2 = S1 + CH
	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	
		vpxor	XTMP3, XTMP3, XTMP2	; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
		
	mov	y0, a		; y0 = a
	add	h, y2		; h = h + S1 + CH + k + w
	mov	y2, a		; y2 = a
	
		vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
		
	or	y0, c		; y0 = a|c
	add	d, h		; d = d + h + S1 + CH + k + w
	and	y2, c		; y2 = a&c
		;; compute low s1
		vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
	and	y0, b		; y0 = (a|c)&b
	add	h, y1		; h = h + S1 + CH + k + w + S0
		vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {BBAA}
		
	mov	y0, e		; y0 = e
	mov	y1, a		; y1 = a
	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
	
		;vmovdqa	XTMP4, XTMP2	; XTMP4 = W[-2] {BBAA}
		
	xor	y0, e		; y0 = e ^ (e >> (25-11))
	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
	mov	y2, f		; y2 = f
	xor	y1, a		; y1 = a ^ (a >> (22-13)
	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
	
		vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
		
	xor	y2, g		; y2 = f^g
	
		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
		
	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e		; y2 = (f^g)&e
	
		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
		
	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	y2, g		; y2 = CH = ((f^g)&e)^g
	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
		vpxor	XTMP2, XTMP2, XTMP3
	add	y2, y0		; y2 = S1 + CH
	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
		vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
	mov	y0, a		; y0 = a
	add	h, y2		; h = h + S1 + CH + k + w
	mov	y2, a		; y2 = a
		vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
	or	y0, c		; y0 = a|c
	add	d, h		; d = d + h + S1 + CH + k + w
	and	y2, c		; y2 = a&c
		vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
	and	y0, b		; y0 = (a|c)&b
	add	h, y1		; h = h + S1 + CH + k + w + S0
		;; compute high s1
		vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ

ROTATE_ARGS
		;vmovdqa	XTMP3, XTMP2	; XTMP3 = W[-2] {DDCC}
	mov	y0, e		; y0 = e
	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
	mov	y1, a		; y1 = a
		;vmovdqa	XTMP5,    XTMP2	; XTMP5    = W[-2] {DDCC}
	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
	xor	y0, e		; y0 = e ^ (e >> (25-11))
	mov	y2, f		; y2 = f
	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
	
		vpsrld	XTMP5, XTMP2,   10	; XTMP5 = W[-2] >> 10 {DDCC}
		
	xor	y1, a		; y1 = a ^ (a >> (22-13)
	xor	y2, g		; y2 = f^g
	
		vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
		
	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e		; y2 = (f^g)&e
	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
	
		vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
		
	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor	y2, g		; y2 = CH = ((f^g)&e)^g
	
		vpxor	XTMP2, XTMP2, XTMP3
		
	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, y0		; y2 = S1 + CH
	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
		vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
	mov	y0, a		; y0 = a
	add	h, y2		; h = h + S1 + CH + k + w
	mov	y2, a		; y2 = a
		vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
	or	y0, c		; y0 = a|c
	add	d, h		; d = d + h + S1 + CH + k + w
	and	y2, c		; y2 = a&c
		vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
	and	y0, b		; y0 = (a|c)&b
	add	h, y1		; h = h + S1 + CH + k + w + S0
	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
	
ROTATE_ARGS
rotate_Xs
%endm

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
	mov	y0, e		; y0 = e
	MY_ROR	y0, (25-11)	; y0 = e >> (25-11)
	mov	y1, a		; y1 = a
	xor	y0, e		; y0 = e ^ (e >> (25-11))
	MY_ROR	y1, (22-13)	; y1 = a >> (22-13)
	mov	y2, f		; y2 = f
	xor	y1, a		; y1 = a ^ (a >> (22-13)
	MY_ROR	y0, (11-6)	; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	y2, g		; y2 = f^g
	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	y1, (13-2)	; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	y2, e		; y2 = (f^g)&e
	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6		; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
	xor	y2, g		; y2 = CH = ((f^g)&e)^g
	add	y2, y0		; y2 = S1 + CH
	MY_ROR	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
	mov	y0, a		; y0 = a
	add	h, y2		; h = h + S1 + CH + k + w
	mov	y2, a		; y2 = a
	or	y0, c		; y0 = a|c
	add	d, h		; d = d + h + S1 + CH + k + w
	and	y2, c		; y2 = a&c
	and	y0, b		; y0 = (a|c)&b
	add	h, y1		; h = h + S1 + CH + k + w + S0
	or	y0, y2		; y0 = MAJ = (a|c)&b)|(a&c)
	add	h, y0		; h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
section .text
global sha256_avx
align 32
sha256_avx:
	push	rbx
%ifdef WINABI
	push	rsi
	push	rdi
%endif
	push	rbp
	push	r13
	push	r14
	push	r15

	sub	rsp,STACK_SIZE
%ifdef WINABI
	vmovdqa	[rsp + _XMM_SAVE + 0*16],xmm6	
	vmovdqa	[rsp + _XMM_SAVE + 1*16],xmm7
	vmovdqa	[rsp + _XMM_SAVE + 2*16],xmm8	
	vmovdqa	[rsp + _XMM_SAVE + 3*16],xmm9	
	vmovdqa	[rsp + _XMM_SAVE + 4*16],xmm10
	vmovdqa	[rsp + _XMM_SAVE + 5*16],xmm11
	vmovdqa	[rsp + _XMM_SAVE + 6*16],xmm12
	vmovdqa	[rsp + _XMM_SAVE + 7*16],xmm13
%endif

	shl	NUM_BLKS, 6	; convert to bytes
	jz	done_hash
	add	NUM_BLKS, INP	; pointer to end of data
	mov	[rsp + _INP_END], NUM_BLKS

	;; load initial digest
	mov	a,[4*0 + CTX]
	mov	b,[4*1 + CTX]
	mov	c,[4*2 + CTX]
	mov	d,[4*3 + CTX]
	mov	e,[4*4 + CTX]
	mov	f,[4*5 + CTX]
	mov	g,[4*6 + CTX]
	mov	h,[4*7 + CTX]

	vmovdqa	BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
	vmovdqa	SHUF_00BA, [_SHUF_00BA wrt rip]
	vmovdqa	SHUF_DC00, [_SHUF_DC00 wrt rip]

loop0:
	lea	TBL,[K256 wrt rip]

	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
	
	mov	[rsp + _INP], INP

	;; schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	SRND, 3
align 16
loop1:
	vpaddd	XFER, X0, [TBL + 0*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 1*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 2*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 3*16]
	vmovdqa	[rsp + _XFER], XFER
	add	TBL, 4*16
	FOUR_ROUNDS_AND_SCHED

	sub	SRND, 1
	jne	loop1

	mov	SRND, 2
loop2:
	vpaddd	XFER, X0, [TBL + 0*16]
	vmovdqa	[rsp + _XFER], XFER
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	XFER, X1, [TBL + 1*16]
	vmovdqa	[rsp + _XFER], XFER
	add	TBL, 2*16
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X0, X2
	vmovdqa	X1, X3

	sub	SRND, 1
	jne	loop2


	addm	[4*0 + CTX],a
	addm	[4*1 + CTX],b
	addm	[4*2 + CTX],c
	addm	[4*3 + CTX],d
	addm	[4*4 + CTX],e
	addm	[4*5 + CTX],f
	addm	[4*6 + CTX],g
	addm	[4*7 + CTX],h

	mov	INP, [rsp + _INP]
	add	INP, 64
	cmp	INP, [rsp + _INP_END]
	jne	loop0

done_hash:
%ifdef WINABI
	vmovdqa	xmm6,[rsp + _XMM_SAVE + 0*16]
	vmovdqa	xmm7,[rsp + _XMM_SAVE + 1*16]
	vmovdqa	xmm8,[rsp + _XMM_SAVE + 2*16]
	vmovdqa	xmm9,[rsp + _XMM_SAVE + 3*16]
	vmovdqa	xmm10,[rsp + _XMM_SAVE + 4*16]
	vmovdqa	xmm11,[rsp + _XMM_SAVE + 5*16]
	vmovdqa	xmm12,[rsp + _XMM_SAVE + 6*16]
	vmovdqa	xmm13,[rsp + _XMM_SAVE + 7*16]
%endif


	add	rsp, STACK_SIZE

	pop	r15
	pop	r14
	pop	r13
	pop	rbp
%ifdef WINABI
	pop	rdi
	pop	rsi
%endif
	pop	rbx

	ret	
	

section .data
align 64
K256:
	dd	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203

; shuffle xBxA -> 00BA
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100

; shuffle xDxC -> DC00
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF

%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif