27 #if (CRYPTOPP_AESNI_AVAILABLE)
29 # include <emmintrin.h>
30 # include <smmintrin.h>
31 # include <wmmintrin.h>
34 #if (CRYPTOPP_ARM_NEON_HEADER)
36 # include <arm_neon.h>
39 #if (CRYPTOPP_ARM_ACLE_HEADER)
42 # include <arm_acle.h>
49 #if defined(CRYPTOPP_POWER8_AES_AVAILABLE)
54 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
59 #ifndef EXCEPTION_EXECUTE_HANDLER
60 # define EXCEPTION_EXECUTE_HANDLER 1
64 #define M128_CAST(x) ((__m128i *)(void *)(x))
65 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
// Squash MS LNK4221 and libtool warnings by giving this TU an exported symbol.
extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__;
74 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
typedef void (*SigHandler)(int);

// Jump target for the SIGILL-based CPU feature probe below.
static jmp_buf s_jmpSIGILL;

// Signal handler installed while probing for AES instructions.
// An unsupported instruction raises SIGILL; we longjmp back to the
// setjmp point in CPU_ProbeAES instead of terminating the process.
static void SigIllHandler(int)
{
    longjmp(s_jmpSIGILL, 1);
}
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
// Runtime probe for ARMv8 AES instructions. Executes a few vaese/vaesd
// instructions under a SEH guard (MSVC) or a SIGILL handler (GNU-style);
// returns false if the CPU faults on them.
bool CPU_ProbeAES()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_AES_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool result = true;
    __try
    {
        uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
        uint8x16_t r1 = vaeseq_u8(data, key);
        uint8x16_t r2 = vaesdq_u8(data, key);
        r1 = vaesmcq_u8(r1);
        r2 = vaesimcq_u8(r2);

        // Use the results so the compiler cannot elide the probe.
        result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return result;
# else
    // longjmp and clobber warnings. Volatile is required.
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        signal(SIGILL, oldHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0);
        uint8x16_t r1 = vaeseq_u8(data, key);
        uint8x16_t r2 = vaesdq_u8(data, key);
        r1 = vaesmcq_u8(r1);
        r2 = vaesimcq_u8(r2);

        // Use the results so the compiler cannot elide the probe.
        result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7));
    }

    // Restore the original signal disposition and mask before returning.
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_AES_AVAILABLE
}
#endif  // ARM32 or ARM64
153 #if (CRYPTOPP_ARM_AES_AVAILABLE)
155 ANONYMOUS_NAMESPACE_BEGIN
// Encrypt one 16-byte block in place with ARMv8 AES instructions.
// subkeys holds (rounds+1) 16-byte round keys; rounds is 10/12/14.
inline void ARMV8_Enc_Block(uint64x2_t &data, const word32 *subkeys, unsigned int rounds)
{
    CRYPTOPP_ASSERT(subkeys);
    const byte *keys = reinterpret_cast<const byte*>(subkeys);
    uint8x16_t block = vreinterpretq_u8_u64(data);

    // AES single round encryption
    block = vaeseq_u8(block, vld1q_u8(keys+0*16));
    // AES mix columns
    block = vaesmcq_u8(block);

    // vaeseq_u8 XORs the round key internally, so rounds are
    // consumed pairwise with MixColumns between them.
    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        // AES single round encryption
        block = vaeseq_u8(block, vld1q_u8(keys+i*16));
        // AES mix columns
        block = vaesmcq_u8(block);
        // AES single round encryption
        block = vaeseq_u8(block, vld1q_u8(keys+(i+1)*16));
        // AES mix columns
        block = vaesmcq_u8(block);
    }

    // AES single round encryption (no MixColumns on the final round)
    block = vaeseq_u8(block, vld1q_u8(keys+(rounds-1)*16));
    // Final Add (bitwise Xor)
    block = veorq_u8(block, vld1q_u8(keys+rounds*16));

    data = vreinterpretq_u64_u8(block);
}
// Encrypt six 16-byte blocks in place with ARMv8 AES instructions.
// Processing six blocks per call keeps the NEON pipelines busy.
inline void ARMV8_Enc_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
    uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
    const word32 *subkeys, unsigned int rounds)
{
    CRYPTOPP_ASSERT(subkeys);
    const byte *keys = reinterpret_cast<const byte*>(subkeys);

    uint8x16_t block0 = vreinterpretq_u8_u64(data0);
    uint8x16_t block1 = vreinterpretq_u8_u64(data1);
    uint8x16_t block2 = vreinterpretq_u8_u64(data2);
    uint8x16_t block3 = vreinterpretq_u8_u64(data3);
    uint8x16_t block4 = vreinterpretq_u8_u64(data4);
    uint8x16_t block5 = vreinterpretq_u8_u64(data5);

    uint8x16_t key;
    for (unsigned int i=0; i<rounds-1; ++i)
    {
        // Load the round key once and apply it to all six blocks.
        key = vld1q_u8(keys+i*16);
        // AES single round encryption
        block0 = vaeseq_u8(block0, key);
        // AES mix columns
        block0 = vaesmcq_u8(block0);
        // AES single round encryption
        block1 = vaeseq_u8(block1, key);
        // AES mix columns
        block1 = vaesmcq_u8(block1);
        // AES single round encryption
        block2 = vaeseq_u8(block2, key);
        // AES mix columns
        block2 = vaesmcq_u8(block2);
        // AES single round encryption
        block3 = vaeseq_u8(block3, key);
        // AES mix columns
        block3 = vaesmcq_u8(block3);
        // AES single round encryption
        block4 = vaeseq_u8(block4, key);
        // AES mix columns
        block4 = vaesmcq_u8(block4);
        // AES single round encryption
        block5 = vaeseq_u8(block5, key);
        // AES mix columns
        block5 = vaesmcq_u8(block5);
    }

    // AES single round encryption (final round, no MixColumns)
    key = vld1q_u8(keys+(rounds-1)*16);
    block0 = vaeseq_u8(block0, key);
    block1 = vaeseq_u8(block1, key);
    block2 = vaeseq_u8(block2, key);
    block3 = vaeseq_u8(block3, key);
    block4 = vaeseq_u8(block4, key);
    block5 = vaeseq_u8(block5, key);

    // Final Add (bitwise Xor)
    key = vld1q_u8(keys+rounds*16);
    data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
    data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
    data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
    data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
    data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
    data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
}
// Decrypt one 16-byte block in place with ARMv8 AES instructions.
// subkeys holds (rounds+1) 16-byte decryption round keys.
inline void ARMV8_Dec_Block(uint64x2_t &data, const word32 *subkeys, unsigned int rounds)
{
    CRYPTOPP_ASSERT(subkeys);
    const byte *keys = reinterpret_cast<const byte*>(subkeys);
    uint8x16_t block = vreinterpretq_u8_u64(data);

    // AES single round decryption
    block = vaesdq_u8(block, vld1q_u8(keys+0*16));
    // AES inverse mix columns
    block = vaesimcq_u8(block);

    for (unsigned int i=1; i<rounds-1; i+=2)
    {
        // AES single round decryption
        block = vaesdq_u8(block, vld1q_u8(keys+i*16));
        // AES inverse mix columns
        block = vaesimcq_u8(block);
        // AES single round decryption
        block = vaesdq_u8(block, vld1q_u8(keys+(i+1)*16));
        // AES inverse mix columns
        block = vaesimcq_u8(block);
    }

    // AES single round decryption (no InvMixColumns on the final round)
    block = vaesdq_u8(block, vld1q_u8(keys+(rounds-1)*16));
    // Final Add (bitwise Xor)
    block = veorq_u8(block, vld1q_u8(keys+rounds*16));

    data = vreinterpretq_u64_u8(block);
}
// Decrypt six 16-byte blocks in place with ARMv8 AES instructions.
// Mirrors ARMV8_Enc_6_Blocks using the inverse-cipher instructions.
inline void ARMV8_Dec_6_Blocks(uint64x2_t &data0, uint64x2_t &data1,
    uint64x2_t &data2, uint64x2_t &data3, uint64x2_t &data4, uint64x2_t &data5,
    const word32 *subkeys, unsigned int rounds)
{
    CRYPTOPP_ASSERT(subkeys);
    const byte *keys = reinterpret_cast<const byte*>(subkeys);

    uint8x16_t block0 = vreinterpretq_u8_u64(data0);
    uint8x16_t block1 = vreinterpretq_u8_u64(data1);
    uint8x16_t block2 = vreinterpretq_u8_u64(data2);
    uint8x16_t block3 = vreinterpretq_u8_u64(data3);
    uint8x16_t block4 = vreinterpretq_u8_u64(data4);
    uint8x16_t block5 = vreinterpretq_u8_u64(data5);

    uint8x16_t key;
    for (unsigned int i=0; i<rounds-1; ++i)
    {
        // Load the round key once and apply it to all six blocks.
        key = vld1q_u8(keys+i*16);
        // AES single round decryption
        block0 = vaesdq_u8(block0, key);
        // AES inverse mix columns
        block0 = vaesimcq_u8(block0);
        // AES single round decryption
        block1 = vaesdq_u8(block1, key);
        // AES inverse mix columns
        block1 = vaesimcq_u8(block1);
        // AES single round decryption
        block2 = vaesdq_u8(block2, key);
        // AES inverse mix columns
        block2 = vaesimcq_u8(block2);
        // AES single round decryption
        block3 = vaesdq_u8(block3, key);
        // AES inverse mix columns
        block3 = vaesimcq_u8(block3);
        // AES single round decryption
        block4 = vaesdq_u8(block4, key);
        // AES inverse mix columns
        block4 = vaesimcq_u8(block4);
        // AES single round decryption
        block5 = vaesdq_u8(block5, key);
        // AES inverse mix columns
        block5 = vaesimcq_u8(block5);
    }

    // AES single round decryption (final round, no InvMixColumns)
    key = vld1q_u8(keys+(rounds-1)*16);
    block0 = vaesdq_u8(block0, key);
    block1 = vaesdq_u8(block1, key);
    block2 = vaesdq_u8(block2, key);
    block3 = vaesdq_u8(block3, key);
    block4 = vaesdq_u8(block4, key);
    block5 = vaesdq_u8(block5, key);

    // Final Add (bitwise Xor)
    key = vld1q_u8(keys+rounds*16);
    data0 = vreinterpretq_u64_u8(veorq_u8(block0, key));
    data1 = vreinterpretq_u64_u8(veorq_u8(block1, key));
    data2 = vreinterpretq_u64_u8(veorq_u8(block2, key));
    data3 = vreinterpretq_u64_u8(veorq_u8(block3, key));
    data4 = vreinterpretq_u64_u8(veorq_u8(block4, key));
    data5 = vreinterpretq_u64_u8(veorq_u8(block5, key));
}
345 ANONYMOUS_NAMESPACE_END
347 size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
348 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
351 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
354 size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(
const word32 *subKeys,
size_t rounds,
355 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
358 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
365 #if (CRYPTOPP_AESNI_AVAILABLE)
367 ANONYMOUS_NAMESPACE_BEGIN
370 CRYPTOPP_ALIGN_DATA(16)
371 const
word32 s_rconLE[] = {
372 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
375 inline void AESNI_Enc_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
377 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
379 block = _mm_xor_si128(block, skeys[0]);
380 for (
unsigned int i=1; i<rounds-1; i+=2)
382 block = _mm_aesenc_si128(block, skeys[i]);
383 block = _mm_aesenc_si128(block, skeys[i+1]);
385 block = _mm_aesenc_si128(block, skeys[rounds-1]);
386 block = _mm_aesenclast_si128(block, skeys[rounds]);
389 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
392 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
394 __m128i rk = skeys[0];
395 block0 = _mm_xor_si128(block0, rk);
396 block1 = _mm_xor_si128(block1, rk);
397 block2 = _mm_xor_si128(block2, rk);
398 block3 = _mm_xor_si128(block3, rk);
399 for (
unsigned int i=1; i<rounds; i++)
402 block0 = _mm_aesenc_si128(block0, rk);
403 block1 = _mm_aesenc_si128(block1, rk);
404 block2 = _mm_aesenc_si128(block2, rk);
405 block3 = _mm_aesenc_si128(block3, rk);
408 block0 = _mm_aesenclast_si128(block0, rk);
409 block1 = _mm_aesenclast_si128(block1, rk);
410 block2 = _mm_aesenclast_si128(block2, rk);
411 block3 = _mm_aesenclast_si128(block3, rk);
414 inline void AESNI_Dec_Block(__m128i &block,
MAYBE_CONST word32 *subkeys,
unsigned int rounds)
416 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
418 block = _mm_xor_si128(block, skeys[0]);
419 for (
unsigned int i=1; i<rounds-1; i+=2)
421 block = _mm_aesdec_si128(block, skeys[i]);
422 block = _mm_aesdec_si128(block, skeys[i+1]);
424 block = _mm_aesdec_si128(block, skeys[rounds-1]);
425 block = _mm_aesdeclast_si128(block, skeys[rounds]);
428 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
431 const __m128i* skeys =
reinterpret_cast<const __m128i*
>(subkeys);
433 __m128i rk = skeys[0];
434 block0 = _mm_xor_si128(block0, rk);
435 block1 = _mm_xor_si128(block1, rk);
436 block2 = _mm_xor_si128(block2, rk);
437 block3 = _mm_xor_si128(block3, rk);
438 for (
unsigned int i=1; i<rounds; i++)
441 block0 = _mm_aesdec_si128(block0, rk);
442 block1 = _mm_aesdec_si128(block1, rk);
443 block2 = _mm_aesdec_si128(block2, rk);
444 block3 = _mm_aesdec_si128(block3, rk);
447 block0 = _mm_aesdeclast_si128(block0, rk);
448 block1 = _mm_aesdeclast_si128(block1, rk);
449 block2 = _mm_aesdeclast_si128(block2, rk);
450 block3 = _mm_aesdeclast_si128(block3, rk);
453 ANONYMOUS_NAMESPACE_END
455 void Rijndael_UncheckedSetKey_SSE4_AESNI(
const byte *userKey,
size_t keyLen,
word32 *rk)
457 const size_t rounds = keyLen / 4 + 6;
458 const word32 *rc = s_rconLE;
460 __m128i temp = _mm_loadu_si128(
M128_CAST(userKey+keyLen-16));
461 std::memcpy(rk, userKey, keyLen);
464 const size_t keySize = 4*(rounds+1);
465 const word32* end = rk + keySize;
469 rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
470 rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
471 rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
472 rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2];
474 if (rk + keyLen/4 + 4 == end)
479 rk[10] = rk[ 4] ^ rk[ 9];
480 rk[11] = rk[ 5] ^ rk[10];
481 temp = _mm_insert_epi32(temp, rk[11], 3);
483 else if (keyLen == 32)
485 temp = _mm_insert_epi32(temp, rk[11], 3);
486 rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
487 rk[13] = rk[ 5] ^ rk[12];
488 rk[14] = rk[ 6] ^ rk[13];
489 rk[15] = rk[ 7] ^ rk[14];
490 temp = _mm_insert_epi32(temp, rk[15], 3);
494 temp = _mm_insert_epi32(temp, rk[7], 3);
501 void Rijndael_UncheckedSetKeyRev_AESNI(
word32 *key,
unsigned int rounds)
508 for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4)
510 temp = _mm_aesimc_si128(*
M128_CAST(key+i));
518 size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
519 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
527 sk, rounds, ib, xb, outBlocks, length, flags);
530 size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(
const word32 *subKeys,
size_t rounds,
531 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
538 sk, rounds, ib, xb, outBlocks, length, flags);
545 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
547 ANONYMOUS_NAMESPACE_BEGIN
550 CRYPTOPP_ALIGN_DATA(16)
551 static const uint32_t s_rconBE[] = {
552 0x01000000, 0x02000000, 0x04000000, 0x08000000,
553 0x10000000, 0x20000000, 0x40000000, 0x80000000,
554 0x1B000000, 0x36000000
557 inline void POWER8_Enc_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
560 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
565 for (
size_t i=1; i<rounds-1; i+=2)
580 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
583 block0 =
VecXor(block0, k);
584 block1 =
VecXor(block1, k);
585 block2 =
VecXor(block2, k);
586 block3 =
VecXor(block3, k);
587 block4 =
VecXor(block4, k);
588 block5 =
VecXor(block5, k);
590 for (
size_t i=1; i<rounds; ++i)
610 inline void POWER8_Dec_Block(
uint32x4_p &block,
const word32 *subkeys,
unsigned int rounds)
613 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
618 for (
size_t i=rounds-1; i>1; i-=2)
633 const byte *keys =
reinterpret_cast<const byte*
>(subkeys);
636 block0 =
VecXor(block0, k);
637 block1 =
VecXor(block1, k);
638 block2 =
VecXor(block2, k);
639 block3 =
VecXor(block3, k);
640 block4 =
VecXor(block4, k);
641 block5 =
VecXor(block5, k);
643 for (
size_t i=rounds-1; i>0; --i)
663 ANONYMOUS_NAMESPACE_END
665 void Rijndael_UncheckedSetKey_POWER8(
const byte* userKey,
size_t keyLen,
word32* rk,
const byte* Se)
667 const size_t rounds = keyLen / 4 + 6;
668 const word32 *rc = s_rconBE;
674 const size_t keySize = 4*(rounds+1);
675 const word32* end = rkey + keySize;
679 temp = rkey[keyLen/4-1];
680 word32 x = (
word32(Se[GETBYTE(temp, 2)]) << 24) ^ (
word32(Se[GETBYTE(temp, 1)]) << 16) ^
681 (
word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
682 rkey[keyLen/4] = rkey[0] ^ x ^ *(rc++);
683 rkey[keyLen/4+1] = rkey[1] ^ rkey[keyLen/4];
684 rkey[keyLen/4+2] = rkey[2] ^ rkey[keyLen/4+1];
685 rkey[keyLen/4+3] = rkey[3] ^ rkey[keyLen/4+2];
687 if (rkey + keyLen/4 + 4 == end)
692 rkey[10] = rkey[ 4] ^ rkey[ 9];
693 rkey[11] = rkey[ 5] ^ rkey[10];
695 else if (keyLen == 32)
698 rkey[12] = rkey[ 4] ^ (
word32(Se[GETBYTE(temp, 3)]) << 24) ^ (
word32(Se[GETBYTE(temp, 2)]) << 16) ^ (
word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
699 rkey[13] = rkey[ 5] ^ rkey[12];
700 rkey[14] = rkey[ 6] ^ rkey[13];
701 rkey[15] = rkey[ 7] ^ rkey[14];
706 #if (CRYPTOPP_LITTLE_ENDIAN)
708 const uint8x16_p mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
711 for (i=0; i<rounds; i+=2, rkey+=8)
717 for ( ; i<rounds+1; i++, rkey+=4)
722 size_t Rijndael_Enc_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
723 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
726 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
729 size_t Rijndael_Dec_AdvancedProcessBlocks128_6x1_ALTIVEC(
const word32 *subKeys,
size_t rounds,
730 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length,
word32 flags)
733 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
Template for AdvancedProcessBlocks and SIMD processing.
#define MAYBE_UNCONST_CAST(T, x)
SunCC workaround.
size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
#define M128_CAST(x)
Clang workaround.
size_t AdvancedProcessBlocks128_6x1_NEON(F1 func1, F6 func6, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 6 blocks.
#define MAYBE_CONST
SunCC workaround.
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Library configuration file.
unsigned int word32
32-bit unsigned datatype
@ BIG_ENDIAN_ORDER
byte order is big-endian
Utility functions for the Crypto++ library.
bool IsAlignedOn(const void *ptr, unsigned int alignment)
Determines whether ptr is aligned to a minimum value.
void vec_swap(T &a, T &b)
Swaps two variables which are arrays.
Crypto++ library namespace.
Support functions for PowerPC and vector operations.
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.