1 /**
2 * BMI2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
4 *
5 * Copyright: Copyright Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.bmi2intrin;
9 
10 import inteli.internals;
11 
// True when the compiler exposes hardware BMI2 builtins (__builtin_ia32_*),
// letting the wrappers below emit the real instructions instead of the
// portable software fallbacks.
static if (LDC_with_BMI2 || GDC_with_BMI2)
{
    private enum BMI2_builtins = true;
}
else
{
    private enum BMI2_builtins = false;
}

// Everything in this module is side-effect free and GC-free.
nothrow @nogc pure @safe:
22 
23 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
uint _bzhi_u32 (uint a, uint index)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot execute during CTFE, so compile-time
        // evaluation takes the portable fallback instead.
        if (__ctfe)
            return bzhi!uint(a, index);
        else
            return __builtin_ia32_bzhi_si(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
    static immutable uint[2][3] cases = [[5, 0x18], [10, 0x278], [21, 0x14_5678]];
    foreach (c; cases)
        assert (_bzhi_u32(0x1234_5678, c[0]) == c[1]);
}
47 
48 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot run during CTFE; compile-time
        // evaluation uses the portable fallback below.
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_bzhi_di(a, index);
            }
            else
                return bzhi!ulong(a, index);
        }
        else
            return bzhi!ulong(a, index);
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
           assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
           assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
           assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
           assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}
82 
83 // Helper function for BZHI
// Portable BZHI: zero the bits of `a` from bit `n` upward, where `n` is the
// low 8 bits of `index` (matching the hardware instruction's semantics).
private T bzhi(T)(T a, uint index)
{
    /+
        n := index[7:0]
        dst := a
        IF (n < number of bits)
            dst[MSB:n] := 0
        FI
    +/
    enum numbits = T.sizeof*8;
    // The instruction only consumes index[7:0]; ignoring this made the
    // fallback diverge from hardware for index values >= 256.
    const uint n = index & 0xFF;
    T dst = a;
    if (n < numbits)
    {
        T mask = (T(1) << n) - 1;
        dst &= mask;
    }
    return dst;
}
102 
103 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
104 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    /+
        dst[31:0] := (a * b)[31:0]
        MEM[hi+31:hi] := (a * b)[63:32]
    +/
    // Widen to 64 bits, multiply once, then split the product into halves.
    const ulong product = cast(ulong) a * cast(ulong) b;
    *hi = cast(uint) (product >>> 32);
    return cast(uint) product; // truncation keeps the low 32 bits
}
@system unittest
{
    uint upper;
    const uint lower = _mulx_u32(0x1234_5678, 0x1234_5678, &upper);
    assert (lower == 0x1DF4_D840);
    assert (upper == 0x014B_66DC);
}
121 
122 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and store the high 64-bits in `hi`. This does not read or write arithmetic flags.
123 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
        dst[63:0] := (a * b)[63:0]
        MEM[hi+63:hi]  := (a * b)[127:64]
    +/

    version(LDC)
    {
        // LDC x86: Generates mulx from -O0
        // The IR widens both operands to 128 bits, multiplies, stores the
        // high 64 bits through the pointer and returns the low 64 bits.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/

        /+
            Implementation using 64bit math is more complex...
            a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
                  = z1 << 64  +  z0
        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
            z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
            z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
        +/

        // Split each operand into 32-bit halves.
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        // The four 32x32->64 partial products.
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        // common_term is at most ~3 * 2^32, so its high bits are exactly the
        // carry out of the low 64-bit half.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}
193 
194 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot execute during CTFE, so compile-time
        // evaluation takes the portable fallback instead.
        if (__ctfe)
            return pdep!uint(a, mask);
        else
            return __builtin_ia32_pdep_si(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    enum uint source   = 0x1234_5678;
    enum uint selector = 0x0F0F_0F0F;
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_pdep_u32(source, selector) == 0x0506_0708);
           assert (_pdep_u32(source, selector) == 0x0506_0708);
}
214 
215 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot run during CTFE; compile-time
        // evaluation uses the portable fallback below.
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_pdep_di(a, mask);
            }
            else
                return pdep!ulong(a, mask);
        }
        else
            return pdep!ulong(a, mask);
    }
    else
    {
        return pdep!ulong(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
           assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}
243 
244 // Helper function for PDEP
private T pdep(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[m] := tmp[k]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    // Walk the set bits of `mask` from least to most significant; each one
    // receives the next bit of `a`, starting at bit 0.
    T result = 0;
    T src_bit = 1;
    T remaining = mask;
    while (remaining != 0)
    {
        const T dest_bit = remaining & (~remaining + 1); // isolate lowest set bit
        if (a & src_bit)
            result |= dest_bit;
        src_bit <<= 1;
        remaining &= remaining - 1; // clear the bit just handled
    }
    return result;
}
274 
275 
276 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot execute during CTFE, so compile-time
        // evaluation takes the portable fallback instead.
        if (__ctfe)
            return pext!uint(a, mask);
        else
            return __builtin_ia32_pext_si(a, mask);
    }
    else
    {
        return pext!uint(a, mask);
    }
}
unittest
{
    enum uint source   = 0x1234_5678;
    enum uint selector = 0x0F0F_0F0F;
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_pext_u32(source, selector) == 0x2468);
           assert (_pext_u32(source, selector) == 0x2468);
}
296 
297 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot run during CTFE; compile-time
        // evaluation uses the portable fallback below.
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_pext_di(a, mask);
            }
            else
                return pext!ulong(a, mask);
        }
        else
            return pext!ulong(a, mask);
    }
    else
    {
        return pext!ulong(a, mask);
    }
}
unittest
{
    // `static assert` exercises the CTFE path, plain `assert` the runtime path.
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
           assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}
325 
326 // Helper function for PEXT
private T pext(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        m := 0
        k := 0
        DO WHILE m < number of bits in T
            IF mask[m] == 1
                dst[k] := tmp[m]
                k := k + 1
            FI
            m := m + 1
        OD
    +/
    // For each set bit of `mask` (low to high), copy the matching bit of `a`
    // into the next free low bit of the result.
    T result = 0;
    T out_bit = 1;
    T remaining = mask;
    while (remaining != 0)
    {
        const T src_bit = remaining & (~remaining + 1); // isolate lowest set bit
        if (a & src_bit)
            result |= out_bit;
        out_bit <<= 1;
        remaining &= remaining - 1; // clear the bit just handled
    }
    return result;
}