1 /**
2 * BMI2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
4 *
5 * Copyright: Copyright Johan Engelen 2021.
6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.bmi2intrin;
9
10 import inteli.internals;
11
12 static if (LDC_with_BMI2 || GDC_with_BMI2)
13 {
14 private enum BMI2_builtins = true;
15 }
16 else
17 {
18 private enum BMI2_builtins = false;
19 }
20
21 nothrow @nogc pure @safe:
22
23 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
24 uint _bzhi_u32 (uint a, uint index)
25 {
26 static if (BMI2_builtins)
27 {
28 if (!__ctfe)
29 return __builtin_ia32_bzhi_si(a, index);
30 else
31 return bzhi!uint(a, index);
32 }
33 else
34 {
35 return bzhi!uint(a, index);
36 }
37 }
38 unittest
39 {
40 static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
41 assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
42 static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
43 assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
44 static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
45 assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
46 }
47
48 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
49 ulong _bzhi_u64 (ulong a, uint index)
50 {
51 static if (BMI2_builtins)
52 {
53 if (!__ctfe)
54 {
55 version(X86_64)
56 {
57 // This instruction not available in 32-bit x86.
58 return __builtin_ia32_bzhi_di(a, index);
59 }
60 else
61 return bzhi!ulong(a, index);
62 }
63 else
64 return bzhi!ulong(a, index);
65 }
66 else
67 {
68 return bzhi!ulong(a, index);
69 }
70 }
71 unittest
72 {
73 static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
74 assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
75 static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
76 assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
77 static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
78 assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
79 static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
80 assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
81 }
82
83 // Helper function for BZHI
84 private T bzhi(T)(T a, uint index)
85 {
86 /+
87 n := index[7:0]
88 dst := a
89 IF (n < number of bits)
90 dst[MSB:n] := 0
91 FI
92 +/
93 enum numbits = T.sizeof*8;
94 T dst = a;
95 if (index < numbits)
96 {
97 T mask = (T(1) << index) - 1;
98 dst &= mask;
99 }
100 return dst;
101 }
102
103 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
104 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
105 uint _mulx_u32 (uint a, uint b, uint* hi)
106 {
107 /+
108 dst[31:0] := (a * b)[31:0]
109 MEM[hi+31:hi] := (a * b)[63:32]
110 +/
111 ulong result = cast(ulong) a * b;
112 *hi = cast(uint) ((result >>> 32) & 0xFFFF_FFFF);
113 return cast(uint) (result & 0xFFFF_FFFF);
114 }
115 @system unittest
116 {
117 uint hi;
118 assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
119 assert (hi == 0x014B_66DC);
120 }
121
122 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and store the high 64-bits in `hi`. This does not read or write arithmetic flags.
123 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
124 ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
125 {
126 /+
127 dst[63:0] := (a * b)[63:0]
128 MEM[hi+63:hi] := (a * b)[127:64]
129 +/
130
131 version(LDC)
132 {
133 // LDC x86: Generates mulx from -O0
134 enum ir = `
135 %4 = zext i64 %0 to i128
136 %5 = zext i64 %1 to i128
137 %6 = mul nuw i128 %5, %4
138 %7 = lshr i128 %6, 64
139 %8 = trunc i128 %7 to i64
140 store i64 %8, i64* %2, align 8
141 %9 = trunc i128 %6 to i64
142 ret i64 %9`;
143 return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
144 }
145 else
146 {
147 /+ Straight-forward implementation with `ucent`:
148 ucent result = cast(ucent) a * b;
149 *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
150 return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
151 +/
152
153 /+
154 Implementation using 64bit math is more complex...
155 a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
156 = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
157 = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
158 = c2 << 64 + c11 << 32 + c12 << 32 + c0
159 = z1 << 64 + z0
160 // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
161 // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
162 // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
163 z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
164 The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
165 z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
166 +/
167
168 const ulong a_low = a & 0xFFFF_FFFF;
169 const ulong a_high = a >>> 32;
170 const ulong b_low = b & 0xFFFF_FFFF;
171 const ulong b_high = b >>> 32;
172
173 const ulong c2 = a_high*b_high;
174 const ulong c11 = a_high*b_low;
175 const ulong c12 = a_low*b_high;
176 const ulong c0 = a_low*b_low;
177
178 const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
179 const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
180 const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
181
182 *hi = z1;
183 return z0;
184 }
185 }
186 @system unittest
187 {
188 ulong hi;
189 // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
190 assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
191 assert (hi == 0x14b_66dc_33f6_acdc);
192 }
193
194 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
195 uint _pdep_u32 (uint a, uint mask)
196 {
197 static if (BMI2_builtins)
198 {
199 if (!__ctfe)
200 return __builtin_ia32_pdep_si(a, mask);
201 else
202 return pdep!uint(a, mask);
203 }
204 else
205 {
206 return pdep!uint(a, mask);
207 }
208 }
209 unittest
210 {
211 static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
212 assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
213 }
214
215 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
216 ulong _pdep_u64 (ulong a, ulong mask)
217 {
218 static if (BMI2_builtins)
219 {
220 if (!__ctfe)
221 {
222 version(X86_64)
223 {
224 // This instruction not available in 32-bit x86.
225 return __builtin_ia32_pdep_di(a, mask);
226 }
227 else
228 return pdep!ulong(a, mask);
229 }
230 else
231 return pdep!ulong(a, mask);
232 }
233 else
234 {
235 return pdep!ulong(a, mask);
236 }
237 }
238 unittest
239 {
240 static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
241 assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
242 }
243
244 // Helper function for PDEP
245 private T pdep(T)(T a, T mask)
246 {
247 /+
248 tmp := a
249 dst := 0
250 m := 0
251 k := 0
252 DO WHILE m < 32
253 IF mask[m] == 1
254 dst[m] := tmp[k]
255 k := k + 1
256 FI
257 m := m + 1
258 OD
259 +/
260 T dst;
261 T k_bitpos = 1;
262 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
263 foreach (m; 0..T.sizeof*8)
264 {
265 if (mask & m_bitpos)
266 {
267 dst |= (a & k_bitpos) ? m_bitpos : 0;
268 k_bitpos <<= 1;
269 }
270 m_bitpos <<= 1;
271 }
272 return dst;
273 }
274
275
276 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
277 uint _pext_u32 (uint a, uint mask)
278 {
279 static if (BMI2_builtins)
280 {
281 if (!__ctfe)
282 return __builtin_ia32_pext_si(a, mask);
283 else
284 return pext!uint(a, mask);
285 }
286 else
287 {
288 return pext!uint(a, mask);
289 }
290 }
291 unittest
292 {
293 static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
294 assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
295 }
296
297 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
298 ulong _pext_u64 (ulong a, ulong mask)
299 {
300 static if (BMI2_builtins)
301 {
302 if (!__ctfe)
303 {
304 version(X86_64)
305 {
306 // This instruction not available in 32-bit x86.
307 return __builtin_ia32_pext_di(a, mask);
308 }
309 else
310 return pext!ulong(a, mask);
311 }
312 else
313 return pext!ulong(a, mask);
314 }
315 else
316 {
317 return pext!ulong(a, mask);
318 }
319 }
320 unittest
321 {
322 static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
323 assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
324 }
325
326 // Helper function for PEXT
327 private T pext(T)(T a, T mask)
328 {
329 /+
330 tmp := a
331 dst := 0
332 m := 0
333 k := 0
334 DO WHILE m < number of bits in T
335 IF mask[m] == 1
336 dst[k] := tmp[m]
337 k := k + 1
338 FI
339 m := m + 1
340 OD
341 +/
342 T dst;
343 T k_bitpos = 1;
344 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
345 foreach (m; 0..T.sizeof*8)
346 {
347 if (mask & m_bitpos)
348 {
349 dst |= (a & m_bitpos) ? k_bitpos : 0;
350 k_bitpos <<= 1;
351 }
352 m_bitpos <<= 1;
353 }
354 return dst;
355 }