Diffstat (limited to 'src/Crypto/Aes_hw_armv8.c')
-rw-r--r--    src/Crypto/Aes_hw_armv8.c    316
1 file changed, 316 insertions, 0 deletions
diff --git a/src/Crypto/Aes_hw_armv8.c b/src/Crypto/Aes_hw_armv8.c
new file mode 100644
index 00000000..b67ed1a5
--- /dev/null
+++ b/src/Crypto/Aes_hw_armv8.c
@@ -0,0 +1,316 @@
+/*
+* AES using ARMv8
+* Contributed by Jeffrey Walton
+*
+* Further changes
+* (C) 2017,2018 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+/* Modified and adapted for VeraCrypt */
+
+#include "Common/Tcdefs.h"
+#include "Aes_hw_cpu.h"
+#if !defined(_UEFI)
+#include <memory.h>
+#include <stdlib.h>
+#endif
+#include "cpu.h"
+#include "misc.h"
+
+#if CRYPTOPP_ARM_AES_AVAILABLE
+
+#include <arm_neon.h>
+
+// Single block encryption operations
+VC_INLINE void aes_enc_block(uint8x16_t* B, uint8x16_t K)
+{
+    *B = vaesmcq_u8(vaeseq_u8(*B, K));
+}
+
+VC_INLINE void aes_enc_block_last(uint8x16_t* B, uint8x16_t K, uint8x16_t K2)
+{
+    *B = veorq_u8(vaeseq_u8(*B, K), K2);
+}
+
+// 4-block parallel encryption operations
+VC_INLINE void aes_enc_4_blocks(uint8x16_t* B0, uint8x16_t* B1,
+    uint8x16_t* B2, uint8x16_t* B3, uint8x16_t K)
+{
+    *B0 = vaesmcq_u8(vaeseq_u8(*B0, K));
+    *B1 = vaesmcq_u8(vaeseq_u8(*B1, K));
+    *B2 = vaesmcq_u8(vaeseq_u8(*B2, K));
+    *B3 = vaesmcq_u8(vaeseq_u8(*B3, K));
+}
+
+VC_INLINE void aes_enc_4_blocks_last(uint8x16_t* B0, uint8x16_t* B1,
+    uint8x16_t* B2, uint8x16_t* B3,
+    uint8x16_t K, uint8x16_t K2)
+{
+    *B0 = veorq_u8(vaeseq_u8(*B0, K), K2);
+    *B1 = veorq_u8(vaeseq_u8(*B1, K), K2);
+    *B2 = veorq_u8(vaeseq_u8(*B2, K), K2);
+    *B3 = veorq_u8(vaeseq_u8(*B3, K), K2);
+}
+
+// Single block decryption operations
+VC_INLINE void aes_dec_block(uint8x16_t* B, uint8x16_t K)
+{
+    *B = vaesimcq_u8(vaesdq_u8(*B, K));
+}
+
+VC_INLINE void aes_dec_block_last(uint8x16_t* B, uint8x16_t K, uint8x16_t K2)
+{
+    *B = veorq_u8(vaesdq_u8(*B, K), K2);
+}
+
+// 4-block parallel decryption operations
+VC_INLINE void aes_dec_4_blocks(uint8x16_t* B0, uint8x16_t* B1,
+    uint8x16_t* B2, uint8x16_t* B3, uint8x16_t K)
+{
+    *B0 = vaesimcq_u8(vaesdq_u8(*B0, K));
+    *B1 = vaesimcq_u8(vaesdq_u8(*B1, K));
+    *B2 = vaesimcq_u8(vaesdq_u8(*B2, K));
+    *B3 = vaesimcq_u8(vaesdq_u8(*B3, K));
+}
+
+VC_INLINE void aes_dec_4_blocks_last(uint8x16_t* B0, uint8x16_t* B1,
+    uint8x16_t* B2, uint8x16_t* B3,
+    uint8x16_t K, uint8x16_t K2)
+{
+    *B0 = veorq_u8(vaesdq_u8(*B0, K), K2);
+    *B1 = veorq_u8(vaesdq_u8(*B1, K), K2);
+    *B2 = veorq_u8(vaesdq_u8(*B2, K), K2);
+    *B3 = veorq_u8(vaesdq_u8(*B3, K), K2);
+}
+
+VC_INLINE void aes256_hw_encrypt_blocks(uint8 buffer[], size_t blocks, const uint8* ks)
+{
+    const uint8x16_t K0 = vld1q_u8(ks + 0 * 16);
+    const uint8x16_t K1 = vld1q_u8(ks + 1 * 16);
+    const uint8x16_t K2 = vld1q_u8(ks + 2 * 16);
+    const uint8x16_t K3 = vld1q_u8(ks + 3 * 16);
+    const uint8x16_t K4 = vld1q_u8(ks + 4 * 16);
+    const uint8x16_t K5 = vld1q_u8(ks + 5 * 16);
+    const uint8x16_t K6 = vld1q_u8(ks + 6 * 16);
+    const uint8x16_t K7 = vld1q_u8(ks + 7 * 16);
+    const uint8x16_t K8 = vld1q_u8(ks + 8 * 16);
+    const uint8x16_t K9 = vld1q_u8(ks + 9 * 16);
+    const uint8x16_t K10 = vld1q_u8(ks + 10 * 16);
+    const uint8x16_t K11 = vld1q_u8(ks + 11 * 16);
+    const uint8x16_t K12 = vld1q_u8(ks + 12 * 16);
+    const uint8x16_t K13 = vld1q_u8(ks + 13 * 16);
+    const uint8x16_t K14 = vld1q_u8(ks + 14 * 16);
+
+    while(blocks >= 4) {
+        uint8x16_t B0 = vld1q_u8(buffer);
+        uint8x16_t B1 = vld1q_u8(buffer + 16);
+        uint8x16_t B2 = vld1q_u8(buffer + 32);
+        uint8x16_t B3 = vld1q_u8(buffer + 48);
+
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K0);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K1);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K2);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K3);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K4);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K5);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K6);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K7);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K8);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K9);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K10);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K11);
+        aes_enc_4_blocks(&B0, &B1, &B2, &B3, K12);
+        aes_enc_4_blocks_last(&B0, &B1, &B2, &B3, K13, K14);
+
+        vst1q_u8(buffer, B0);
+        vst1q_u8(buffer + 16, B1);
+        vst1q_u8(buffer + 32, B2);
+        vst1q_u8(buffer + 48, B3);
+
+        buffer += 16 * 4;
+        blocks -= 4;
+    }
+
+    for(size_t i = 0; i != blocks; ++i) {
+        uint8x16_t B = vld1q_u8(buffer + 16 * i);
+        aes_enc_block(&B, K0);
+        aes_enc_block(&B, K1);
+        aes_enc_block(&B, K2);
+        aes_enc_block(&B, K3);
+        aes_enc_block(&B, K4);
+        aes_enc_block(&B, K5);
+        aes_enc_block(&B, K6);
+        aes_enc_block(&B, K7);
+        aes_enc_block(&B, K8);
+        aes_enc_block(&B, K9);
+        aes_enc_block(&B, K10);
+        aes_enc_block(&B, K11);
+        aes_enc_block(&B, K12);
+        aes_enc_block_last(&B, K13, K14);
+        vst1q_u8(buffer + 16 * i, B);
+    }
+}
+
+VC_INLINE void aes256_hw_encrypt_block(uint8 buffer[], const uint8* ks)
+{
+    const uint8x16_t K0 = vld1q_u8(ks + 0 * 16);
+    const uint8x16_t K1 = vld1q_u8(ks + 1 * 16);
+    const uint8x16_t K2 = vld1q_u8(ks + 2 * 16);
+    const uint8x16_t K3 = vld1q_u8(ks + 3 * 16);
+    const uint8x16_t K4 = vld1q_u8(ks + 4 * 16);
+    const uint8x16_t K5 = vld1q_u8(ks + 5 * 16);
+    const uint8x16_t K6 = vld1q_u8(ks + 6 * 16);
+    const uint8x16_t K7 = vld1q_u8(ks + 7 * 16);
+    const uint8x16_t K8 = vld1q_u8(ks + 8 * 16);
+    const uint8x16_t K9 = vld1q_u8(ks + 9 * 16);
+    const uint8x16_t K10 = vld1q_u8(ks + 10 * 16);
+    const uint8x16_t K11 = vld1q_u8(ks + 11 * 16);
+    const uint8x16_t K12 = vld1q_u8(ks + 12 * 16);
+    const uint8x16_t K13 = vld1q_u8(ks + 13 * 16);
+    const uint8x16_t K14 = vld1q_u8(ks + 14 * 16);
+
+    uint8x16_t B = vld1q_u8(buffer);
+    aes_enc_block(&B, K0);
+    aes_enc_block(&B, K1);
+    aes_enc_block(&B, K2);
+    aes_enc_block(&B, K3);
+    aes_enc_block(&B, K4);
+    aes_enc_block(&B, K5);
+    aes_enc_block(&B, K6);
+    aes_enc_block(&B, K7);
+    aes_enc_block(&B, K8);
+    aes_enc_block(&B, K9);
+    aes_enc_block(&B, K10);
+    aes_enc_block(&B, K11);
+    aes_enc_block(&B, K12);
+    aes_enc_block_last(&B, K13, K14);
+    vst1q_u8(buffer, B);
+}
+
+VC_INLINE void aes256_hw_decrypt_blocks(uint8 buffer[], size_t blocks, const uint8* ks)
+{
+    const uint8x16_t K0 = vld1q_u8(ks + 0 * 16);
+    const uint8x16_t K1 = vld1q_u8(ks + 1 * 16);
+    const uint8x16_t K2 = vld1q_u8(ks + 2 * 16);
+    const uint8x16_t K3 = vld1q_u8(ks + 3 * 16);
+    const uint8x16_t K4 = vld1q_u8(ks + 4 * 16);
+    const uint8x16_t K5 = vld1q_u8(ks + 5 * 16);
+    const uint8x16_t K6 = vld1q_u8(ks + 6 * 16);
+    const uint8x16_t K7 = vld1q_u8(ks + 7 * 16);
+    const uint8x16_t K8 = vld1q_u8(ks + 8 * 16);
+    const uint8x16_t K9 = vld1q_u8(ks + 9 * 16);
+    const uint8x16_t K10 = vld1q_u8(ks + 10 * 16);
+    const uint8x16_t K11 = vld1q_u8(ks + 11 * 16);
+    const uint8x16_t K12 = vld1q_u8(ks + 12 * 16);
+    const uint8x16_t K13 = vld1q_u8(ks + 13 * 16);
+    const uint8x16_t K14 = vld1q_u8(ks + 14 * 16);
+
+    while(blocks >= 4) {
+        uint8x16_t B0 = vld1q_u8(buffer);
+        uint8x16_t B1 = vld1q_u8(buffer + 16);
+        uint8x16_t B2 = vld1q_u8(buffer + 32);
+        uint8x16_t B3 = vld1q_u8(buffer + 48);
+
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K0);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K1);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K2);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K3);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K4);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K5);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K6);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K7);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K8);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K9);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K10);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K11);
+        aes_dec_4_blocks(&B0, &B1, &B2, &B3, K12);
+        aes_dec_4_blocks_last(&B0, &B1, &B2, &B3, K13, K14);
+
+        vst1q_u8(buffer, B0);
+        vst1q_u8(buffer + 16, B1);
+        vst1q_u8(buffer + 32, B2);
+        vst1q_u8(buffer + 48, B3);
+
+        buffer += 16 * 4;
+        blocks -= 4;
+    }
+
+    for(size_t i = 0; i != blocks; ++i) {
+        uint8x16_t B = vld1q_u8(buffer + 16 * i);
+        aes_dec_block(&B, K0);
+        aes_dec_block(&B, K1);
+        aes_dec_block(&B, K2);
+        aes_dec_block(&B, K3);
+        aes_dec_block(&B, K4);
+        aes_dec_block(&B, K5);
+        aes_dec_block(&B, K6);
+        aes_dec_block(&B, K7);
+        aes_dec_block(&B, K8);
+        aes_dec_block(&B, K9);
+        aes_dec_block(&B, K10);
+        aes_dec_block(&B, K11);
+        aes_dec_block(&B, K12);
+        aes_dec_block_last(&B, K13, K14);
+        vst1q_u8(buffer + 16 * i, B);
+    }
+}
+
+VC_INLINE void aes256_hw_decrypt_block(uint8 buffer[], const uint8* ks)
+{
+    const uint8x16_t K0 = vld1q_u8(ks + 0 * 16);
+    const uint8x16_t K1 = vld1q_u8(ks + 1 * 16);
+    const uint8x16_t K2 = vld1q_u8(ks + 2 * 16);
+    const uint8x16_t K3 = vld1q_u8(ks + 3 * 16);
+    const uint8x16_t K4 = vld1q_u8(ks + 4 * 16);
+    const uint8x16_t K5 = vld1q_u8(ks + 5 * 16);
+    const uint8x16_t K6 = vld1q_u8(ks + 6 * 16);
+    const uint8x16_t K7 = vld1q_u8(ks + 7 * 16);
+    const uint8x16_t K8 = vld1q_u8(ks + 8 * 16);
+    const uint8x16_t K9 = vld1q_u8(ks + 9 * 16);
+    const uint8x16_t K10 = vld1q_u8(ks + 10 * 16);
+    const uint8x16_t K11 = vld1q_u8(ks + 11 * 16);
+    const uint8x16_t K12 = vld1q_u8(ks + 12 * 16);
+    const uint8x16_t K13 = vld1q_u8(ks + 13 * 16);
+    const uint8x16_t K14 = vld1q_u8(ks + 14 * 16);
+
+    uint8x16_t B = vld1q_u8(buffer);
+    aes_dec_block(&B, K0);
+    aes_dec_block(&B, K1);
+    aes_dec_block(&B, K2);
+    aes_dec_block(&B, K3);
+    aes_dec_block(&B, K4);
+    aes_dec_block(&B, K5);
+    aes_dec_block(&B, K6);
+    aes_dec_block(&B, K7);
+    aes_dec_block(&B, K8);
+    aes_dec_block(&B, K9);
+    aes_dec_block(&B, K10);
+    aes_dec_block(&B, K11);
+    aes_dec_block(&B, K12);
+    aes_dec_block_last(&B, K13, K14);
+    vst1q_u8(buffer, B);
+}
+
+void aes_hw_cpu_decrypt (const uint8 *ks, uint8 *data)
+{
+    aes256_hw_decrypt_block(data, ks);
+}
+
+void aes_hw_cpu_decrypt_32_blocks (const uint8 *ks, uint8 *data)
+{
+    aes256_hw_decrypt_blocks(data, 32, ks);
+}
+
+void aes_hw_cpu_encrypt (const uint8 *ks, uint8 *data)
+{
+    aes256_hw_encrypt_block(data, ks);
+}
+
+void aes_hw_cpu_encrypt_32_blocks (const uint8 *ks, uint8 *data)
+{
+    aes256_hw_encrypt_blocks(data, 32, ks);
+}
+
+#endif
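
Usage note (not part of the patch): below is a minimal sketch of how the aes_hw_cpu_* wrappers defined at the end of this file might be driven from C. It assumes Aes_hw_cpu.h declares these wrappers and that Common/Tcdefs.h provides the uint8 typedef, as the includes above suggest. The 240-byte key schedule is a zero-filled placeholder; real callers pass schedules expanded by VeraCrypt's existing AES key-setup code, laid out as 15 consecutive 16-byte round keys, with separate schedules normally prepared for encryption and decryption. Building it requires an ARMv8 toolchain with the crypto extensions enabled (typically something like -march=armv8-a+crypto) and linking against this file.

    #include <stdio.h>
    #include <string.h>
    #include "Common/Tcdefs.h"   /* uint8 typedef */
    #include "Aes_hw_cpu.h"      /* assumed to declare the aes_hw_cpu_* wrappers above */

    int main(void)
    {
        uint8 ks[15 * 16];       /* placeholder for an expanded AES-256 key schedule (15 round keys) */
        uint8 block[16];         /* one 16-byte block, transformed in place */
        uint8 sector[32 * 16];   /* 32 blocks, the unit handled by the *_32_blocks wrappers */

        memset(ks, 0, sizeof(ks));            /* dummy schedule; real code uses the project's key setup */
        memset(block, 0xAA, sizeof(block));
        memset(sector, 0x55, sizeof(sector));

        aes_hw_cpu_encrypt(ks, block);             /* single block, in place */
        aes_hw_cpu_encrypt_32_blocks(ks, sector);  /* 32 blocks, in place */

        /* Decryption takes the same (ks, data) argument order; a real caller would
           pass the separately prepared decryption key schedule here. */
        aes_hw_cpu_decrypt_32_blocks(ks, sector);
        aes_hw_cpu_decrypt(ks, block);

        printf("block[0] = 0x%02x\n", block[0]);
        return 0;
    }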