1 /**
2 * BMI2 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
4 *
5 * Copyright: Copyright Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.bmi2intrin;
9 
10 import inteli.internals;
11 
12 nothrow @nogc pure @safe:
13 
/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler builtin cannot execute at compile time;
        // CTFE falls through to the portable implementation below.
        if (!__ctfe)
            return __builtin_ia32_bzhi_si(a, index);
    }
    return bzhi!uint(a, index);
}
unittest
{
    // Exercise both the CTFE path (static assert) and the run-time path.
    enum uint x = 0x1234_5678;
    static assert (_bzhi_u32(x, 5)  == 0x18);
    static assert (_bzhi_u32(x, 10) == 0x278);
    static assert (_bzhi_u32(x, 21) == 0x14_5678);
    assert (_bzhi_u32(x, 5)  == 0x18);
    assert (_bzhi_u32(x, 10) == 0x278);
    assert (_bzhi_u32(x, 21) == 0x14_5678);
}
38 
/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit builtin exists only on x86_64 (not 32-bit x86) and
            // cannot execute at compile time; every other case falls through
            // to the portable implementation below.
            if (!__ctfe)
                return __builtin_ia32_bzhi_di(a, index);
        }
    }
    return bzhi!ulong(a, index);
}
unittest
{
    // Exercise both the CTFE path (static assert) and the run-time path.
    enum ulong x = 0x1234_5678;
    static assert (_bzhi_u64(x, 5)  == 0x18);
    static assert (_bzhi_u64(x, 10) == 0x278);
    static assert (_bzhi_u64(x, 21) == 0x14_5678);
    assert (_bzhi_u64(x, 5)  == 0x18);
    assert (_bzhi_u64(x, 10) == 0x278);
    assert (_bzhi_u64(x, 21) == 0x14_5678);
    // A case where bits above position 31 are actually cleared.
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}
73 
// Helper function for BZHI
// Clears the bits of `a` at position `n` and above, mirroring the
// instruction pseudocode:
//     n := index[7:0]
//     dst := a
//     IF (n < number of bits)
//         dst[MSB:n] := 0
//     FI
private T bzhi(T)(T a, uint index)
{
    enum uint numbits = T.sizeof*8;
    // The BZHI instruction only consumes the low 8 bits of the index operand
    // (n := index[7:0]); mask here so the fallback agrees with the hardware
    // path for indices >= 256.
    const uint n = index & 0xFF;
    T dst = a;
    if (n < numbits)
    {
        // Keep only bits [n-1:0].
        const T mask = (T(1) << n) - 1;
        dst &= mask;
    }
    return dst;
}
93 
/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, 
/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
///       But, those particular semantics don't exist at the level of intrinsics.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: this does NOT generate mulx with LDC (nor GDC), and there seems
    // to be no way to force it for some reason, even with LLVM IR.
    const ulong wide = cast(ulong) a * b;
    *hi = cast(uint) (wide >>> 32);
    return cast(uint) wide;
}
@system unittest
{
    // 0x1234_5678 * 0x1234_5678 == 0x014B_66DC_1DF4_D840
    uint upper;
    const uint lower = _mulx_u32(0x1234_5678, 0x1234_5678, &upper);
    assert (lower == 0x1DF4_D840);
    assert (upper == 0x014B_66DC);
}
113 
/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and 
/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
///       But, those particular semantics don't exist at the level of intrinsics.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+ Instruction pseudocode:
        dst[63:0] := (a * b)[63:0]
        MEM[hi+63:hi]  := (a * b)[127:64]
    +/

    version(LDC)
    {
        // LDC x86: Generates mulx from -O0.
        // The IR widens both operands to 128 bits, multiplies, stores the high
        // 64 bits of the product through `hi`, and returns the low 64 bits.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent` (for reference; not
           used because `ucent` is not generally supported):
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/

        /+
            Implementation using only 64-bit math (schoolbook multiplication on
            32-bit halves):
            a * b = ((a_high << 32) + a_low) * ((b_high << 32) + b_low)
                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
                  = z1 << 64 + z0
            The partial sums can overflow 64 bits, so the carry out of the low
            64-bit half must be propagated into the high half.  We therefore
            first form, in full 64-bit math, the sum that makes up bits [63:32]
            of the low half:
                common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF)
            Its low 32 bits are bits [63:32] of z0, and its high 32 bits are
            exactly the carry into the high half:
                z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32)
                z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32)
        +/

        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        // Four 32x32 -> 64-bit partial products.
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        // Sum forming bits [63:32] of z0; its upper half is the carry into z1.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    ulong upper;
    const ulong lower = _mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &upper);
    assert (lower == 0xa5e2_0890_f2a5_2100);
    assert (upper == 0x14b_66dc_33f6_acdc);
}
187 
/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler builtin cannot execute at compile time;
        // CTFE falls through to the portable implementation below.
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
    }
    return pdep!uint(a, mask);
}
unittest
{
    // Low nibbles of `a` are scattered into the set nibbles of the mask.
    enum uint mask = 0x0F0F_0F0F;
    static assert (_pdep_u32(0x1234_5678, mask) == 0x0506_0708);
    assert (_pdep_u32(0x1234_5678, mask) == 0x0506_0708);
}
208 
/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit builtin exists only on x86_64 (not 32-bit x86) and
            // cannot execute at compile time; every other case falls through
            // to the portable implementation below.
            if (!__ctfe)
                return __builtin_ia32_pdep_di(a, mask);
        }
    }
    return pdep!ulong(a, mask);
}
unittest
{
    // Low nibbles of `a` are scattered into the set nibbles of the mask.
    enum ulong mask = 0x0F0F_0F0F_0F0F_0F0F;
    static assert (_pdep_u64(0x1234_5678_8765_4321, mask) == 0x0807_0605_0403_0201);
    assert (_pdep_u64(0x1234_5678_8765_4321, mask) == 0x0807_0605_0403_0201);
}
237 
// Helper function for PDEP
// Scatters the contiguous low bits of `a` into the positions of the set bits
// of `mask`:
//     tmp := a
//     dst := 0
//     k := 0
//     FOR m IN 0 .. number of bits in T
//         IF mask[m] == 1
//             dst[m] := tmp[k]
//             k := k + 1
//         FI
//     ROF
private T pdep(T)(T a, T mask)
{
    T result = 0;
    T src_bit = 1; // next (contiguous) bit of `a` to deposit
    // dst_bit visits every bit position once; it wraps to 0 after the MSB,
    // which terminates the loop.
    for (T dst_bit = 1; dst_bit != 0; dst_bit <<= 1)
    {
        if (mask & dst_bit)
        {
            if (a & src_bit)
                result |= dst_bit;
            src_bit <<= 1;
        }
    }
    return result;
}
268 
269 
/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by 
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // The compiler builtin cannot execute at compile time;
        // CTFE falls through to the portable implementation below.
        if (!__ctfe)
            return __builtin_ia32_pext_si(a, mask);
    }
    return pext!uint(a, mask);
}
unittest
{
    // The set nibbles of the mask select nibbles of `a`, packed to the bottom.
    enum uint mask = 0x0F0F_0F0F;
    static assert (_pext_u32(0x1234_5678, mask) == 0x2468);
    assert (_pext_u32(0x1234_5678, mask) == 0x2468);
}
291 
/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by 
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // The 64-bit builtin exists only on x86_64 (not 32-bit x86) and
            // cannot execute at compile time; every other case falls through
            // to the portable implementation below.
            if (!__ctfe)
                return __builtin_ia32_pext_di(a, mask);
        }
    }
    return pext!ulong(a, mask);
}
unittest
{
    // The set nibbles of the mask select nibbles of `a`, packed to the bottom.
    enum ulong mask = 0x0F0F_0F0F_0F0F_0F0F;
    static assert (_pext_u64(0x1234_5678_8765_4321, mask) == 0x2468_7531);
    assert (_pext_u64(0x1234_5678_8765_4321, mask) == 0x2468_7531);
}
321 
// Helper function for PEXT
// Gathers the bits of `a` selected by the set bits of `mask` into contiguous
// low bits of the result:
//     tmp := a
//     dst := 0
//     k := 0
//     FOR m IN 0 .. number of bits in T
//         IF mask[m] == 1
//             dst[k] := tmp[m]
//             k := k + 1
//         FI
//     ROF
private T pext(T)(T a, T mask)
{
    T result = 0;
    T out_bit = 1; // next low bit of the result to fill
    // src_bit visits every bit position once; it wraps to 0 after the MSB,
    // which terminates the loop.
    for (T src_bit = 1; src_bit != 0; src_bit <<= 1)
    {
        if (mask & src_bit)
        {
            if (a & src_bit)
                result |= out_bit;
            out_bit <<= 1;
        }
    }
    return result;
}