1 /**
2 * BMI2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
4 *
5 * Copyright: Copyright Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.bmi2intrin;
9 
10 import inteli.internals;
11 
12 static if (LDC_with_BMI2 || GDC_with_BMI2)
13 {
14     private enum BMI2_builtins = true;
15 }
16 else
17 {
18     private enum BMI2_builtins = false;
19 }
20 
21 nothrow @nogc pure @safe:
22 
23 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
24 uint _bzhi_u32 (uint a, uint index)
25 {
26     static if (BMI2_builtins)
27     {
28         if (!__ctfe)
29             return __builtin_ia32_bzhi_si(a, index);
30         else
31             return bzhi!uint(a, index);
32     }
33     else
34     {
35         return bzhi!uint(a, index);
36     }
37 }
38 unittest
39 {
40     static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
41            assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
42     static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
43            assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
44     static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
45            assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
46 }
47 
48 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
49 ulong _bzhi_u64 (ulong a, uint index)
50 {
51     static if (BMI2_builtins)
52     {
53         if (!__ctfe)
54             return __builtin_ia32_bzhi_di(a, index);
55         else
56             return bzhi!ulong(a, index);
57     }
58     else
59     {
60         return bzhi!ulong(a, index);
61     }
62 }
63 unittest
64 {
65     static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
66            assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
67     static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
68            assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
69     static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
70            assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
71     static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
72            assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
73 }
74 
75 // Helper function for BZHI
76 private T bzhi(T)(T a, uint index)
77 {
78     /+
79         n := index[7:0]
80         dst := a
81         IF (n < number of bits)
82             dst[MSB:n] := 0
83         FI
84     +/
85     enum numbits = T.sizeof*8;
86     T dst = a;
87     if (index < numbits)
88     {
89         T mask = (T(1) << index) - 1;
90         dst &= mask;
91     }
92     return dst;
93 }
94 
95 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
96 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
97 uint _mulx_u32 (uint a, uint b, uint* hi)
98 {
99     /+
100         dst[31:0] := (a * b)[31:0]
101         MEM[hi+31:hi] := (a * b)[63:32]
102     +/
103     ulong result = cast(ulong) a * b;
104     *hi = cast(uint) ((result >>> 32) & 0xFFFF_FFFF);
105     return cast(uint) (result & 0xFFFF_FFFF);
106 }
107 @system unittest
108 {
109     uint hi;
110     assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
111     assert (hi == 0x014B_66DC);
112 }
113 
114 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and store the high 64-bits in `hi`. This does not read or write arithmetic flags.
115 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
116 ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
117 {
118     /+
119         dst[63:0] := (a * b)[63:0]
120         MEM[hi+63:hi]  := (a * b)[127:64]
121     +/
122 
123     version(LDC)
124     {
125         // LDC x86: Generates mulx from -O0
126         enum ir = `
127             %4 = zext i64 %0 to i128
128             %5 = zext i64 %1 to i128
129             %6 = mul nuw i128 %5, %4
130             %7 = lshr i128 %6, 64
131             %8 = trunc i128 %7 to i64
132             store i64 %8, i64* %2, align 8
133             %9 = trunc i128 %6 to i64
134             ret i64 %9`;
135         return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
136     }
137     else
138     {
139         /+ Straight-forward implementation with `ucent`:
140         ucent result = cast(ucent) a * b;
141         *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
142         return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
143         +/
144 
145         /+
146             Implementation using 64bit math is more complex...
147             a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
148                   = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
149                   = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
150                   = c2 << 64 + c11 << 32 + c12 << 32 + c0
151                   = z1 << 64  +  z0
152         // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
153         // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
154         // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
155             z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32
156         The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
157             z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32
158         +/
159 
160         const ulong a_low = a & 0xFFFF_FFFF;
161         const ulong a_high = a >>> 32;
162         const ulong b_low = b & 0xFFFF_FFFF;
163         const ulong b_high = b >>> 32;
164 
165         const ulong c2 = a_high*b_high;
166         const ulong c11 = a_high*b_low;
167         const ulong c12 = a_low*b_high;
168         const ulong c0 = a_low*b_low;
169 
170         const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
171         const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
172         const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);
173 
174         *hi = z1;
175         return z0;
176     }
177 }
178 @system unittest
179 {
180     ulong hi;
181     // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
182     assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
183     assert (hi == 0x14b_66dc_33f6_acdc);
184 }
185 
186 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
187 uint _pdep_u32 (uint a, uint mask)
188 {
189     static if (BMI2_builtins)
190     {
191         if (!__ctfe)
192             return __builtin_ia32_pdep_si(a, mask);
193         else
194             return pdep!uint(a, mask);
195     }
196     else
197     {
198         return pdep!uint(a, mask);
199     }
200 }
201 unittest
202 {
203     static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
204            assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
205 }
206 
207 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
208 ulong _pdep_u64 (ulong a, ulong mask)
209 {
210     static if (BMI2_builtins)
211     {
212         if (!__ctfe)
213             return __builtin_ia32_pdep_di(a, mask);
214         else
215             return pdep!ulong(a, mask);
216     }
217     else
218     {
219         return pdep!ulong(a, mask);
220     }
221 }
222 unittest
223 {
224     static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
225            assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
226 }
227 
228 // Helper function for PDEP
229 private T pdep(T)(T a, T mask)
230 {
231     /+
232         tmp := a
233         dst := 0
234         m := 0
235         k := 0
236         DO WHILE m < 32
237             IF mask[m] == 1
238                 dst[m] := tmp[k]
239                 k := k + 1
240             FI
241             m := m + 1
242         OD
243     +/
244     T dst;
245     T k_bitpos = 1;
246     T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
247     foreach (m; 0..T.sizeof*8)
248     {
249         if (mask & m_bitpos)
250         {
251             dst |= (a & k_bitpos) ? m_bitpos : 0;
252             k_bitpos <<= 1;
253         }
254         m_bitpos <<= 1;
255     }
256     return dst;
257 }
258 
259 
260 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
261 uint _pext_u32 (uint a, uint mask)
262 {
263     static if (BMI2_builtins)
264     {
265         if (!__ctfe)
266             return __builtin_ia32_pext_si(a, mask);
267         else
268             return pext!uint(a, mask);
269     }
270     else
271     {
272         return pext!uint(a, mask);
273     }
274 }
275 unittest
276 {
277     static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
278            assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
279 }
280 
281 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
282 ulong _pext_u64 (ulong a, ulong mask)
283 {
284     static if (BMI2_builtins)
285     {
286         if (!__ctfe)
287             return __builtin_ia32_pext_di(a, mask);
288         else
289             return pext!ulong(a, mask);
290     }
291     else
292     {
293         return pext!ulong(a, mask);
294     }}
295 unittest
296 {
297     static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
298            assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
299 }
300 
301 // Helper function for PEXT
302 private T pext(T)(T a, T mask)
303 {
304     /+
305         tmp := a
306         dst := 0
307         m := 0
308         k := 0
309         DO WHILE m < number of bits in T
310             IF mask[m] == 1
311                 dst[k] := tmp[m]
312                 k := k + 1
313             FI
314             m := m + 1
315         OD
316     +/
317     T dst;
318     T k_bitpos = 1;
319     T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed
320     foreach (m; 0..T.sizeof*8)
321     {
322         if (mask & m_bitpos)
323         {
324             dst |= (a & m_bitpos) ? k_bitpos : 0;
325             k_bitpos <<= 1;
326         }
327         m_bitpos <<= 1;
328     }
329     return dst;
330 }