6 #ifndef CRYPTOPP_ARM_SIMD_H
7 #define CRYPTOPP_ARM_SIMD_H
11 #if (CRYPTOPP_ARM_NEON_HEADER)
12 # include <arm_neon.h>
15 #if (CRYPTOPP_ARM_ACLE_HEADER)
17 # include <arm_acle.h>
20 #if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication using the
///  <b>low</b> 64 bits of <tt>a</tt> and the <b>low</b> 64 bits of
///  <tt>b</tt> (the "00" in the name selects the lanes).
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    // MSVC's vmull_p64 takes __n64 operands; build them from lane 0 of each input.
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // Inline asm avoids the poly64 intrinsic type plumbing on GCC/Clang.
    uint64x2_t r;
    __asm __volatile("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback: multiply lane 0 of a by lane 0 of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication using the
///  <b>low</b> 64 bits of <tt>a</tt> and the <b>high</b> 64 bits of
///  <tt>b</tt> (the "01" in the name selects the lanes).
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // pmull reads the low doubleword of each source, so pass b's high half explicitly.
    uint64x2_t r;
    __asm __volatile("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    // Generic intrinsic fallback: multiply lane 0 of a by lane 1 of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication using the
///  <b>high</b> 64 bits of <tt>a</tt> and the <b>low</b> 64 bits of
///  <tt>b</tt> (the "10" in the name selects the lanes).
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // pmull reads the low doubleword of each source, so pass a's high half explicitly.
    uint64x2_t r;
    __asm __volatile("pmull    %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback: multiply lane 1 of a by lane 0 of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}
/// \brief Polynomial multiplication
/// \param a the first term
/// \param b the second term
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication using the
///  <b>high</b> 64 bits of <tt>a</tt> and the <b>high</b> 64 bits of
///  <tt>b</tt> (the "11" in the name selects the lanes).
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    // pmull2 multiplies the high doublewords directly, so the full vectors are passed.
    uint64x2_t r;
    __asm __volatile("pmull2   %0.1q, %1.2d, %2.2d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    // Generic intrinsic fallback: multiply lane 1 of a by lane 1 of b.
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}
/// \brief Vector extraction
/// \param a the first vector
/// \param b the second vector
/// \param c the byte count
/// \return vector
/// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
///  <tt>a</tt> and the remaining bytes of vector <tt>b</tt>.
/// \note <tt>c</tt> must be an immediate-range constant for the GCC/Clang
///  inline-asm path (constraint "I").
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    return (uint64x2_t)vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
#else
    uint64x2_t r;
    __asm __volatile("ext   %0.16b, %1.16b, %2.16b, %3 \n\t"
        :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}
/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first vector
/// \param b the second vector
/// \return vector
/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
///  <tt>a</tt> and the remaining bytes of vector <tt>b</tt>. The template
///  parameter guarantees the shift amount is a compile-time constant,
///  which the "I" asm constraint requires.
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return (uint64x2_t)vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
#else
    uint64x2_t r;
    __asm __volatile("ext   %0.16b, %1.16b, %2.16b, %3 \n\t"
        :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
Library configuration file.