/**
 * BMI2 intrinsics.
 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
 *
 * Copyright: Copyright Johan Engelen 2021.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module inteli.bmi2intrin;

import inteli.internals;

// True when the compiler exposes the BMI2 hardware builtins (LDC or GDC with BMI2 enabled).
static if (LDC_with_BMI2 || GDC_with_BMI2)
{
    private enum BMI2_builtins = true;
}
else
{
    private enum BMI2_builtins = false;
}

nothrow @nogc pure @safe:

/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
uint _bzhi_u32 (uint a, uint index)
{
    static if (BMI2_builtins)
    {
        // The hardware builtin cannot run at compile time, so CTFE takes the
        // portable software path instead.
        if (__ctfe)
            return bzhi!uint(a, index);
        else
            return __builtin_ia32_bzhi_si(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}

/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (BMI2_builtins)
    {
        if (!__ctfe)
            return __builtin_ia32_bzhi_di(a, index);
        else
            return bzhi!ulong(a, index);
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}

// Helper function for BZHI
private T bzhi(T)(T a, uint index)
{
    /+
    n := index[7:0]
    dst := a
    IF (n < number of bits)
        dst[MSB:n] := 0
    FI
    +/
    enum numbits = T.sizeof*8;
    // Per the pseudocode above, only the low 8 bits of `index` are significant.
    // The hardware BZHI instruction (used on the builtin path) behaves this way,
    // so the software/CTFE path must too: e.g. index = 0x105 must mask to 5 bits,
    // not be treated as "index >= numbits, leave `a` untouched".
    const uint n = index & 0xFF;
    T dst = a;
    if (n < numbits)
    {
        T mask = (T(1) << n) - 1;
        dst &= mask;
    }
    return dst;
}

/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
/// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    /+
    dst[31:0] := (a * b)[31:0]
    MEM[hi+31:hi] := (a * b)[63:32]
    +/
    ulong result = cast(ulong) a * b;
    *hi = cast(uint) ((result >>> 32) & 0xFFFF_FFFF);
    return cast(uint) (result & 0xFFFF_FFFF);
}
@system unittest
{
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
}

/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and store the high 64-bits in `hi`. This does not read or write arithmetic flags.
/// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
    dst[63:0] := (a * b)[63:0]
    MEM[hi+63:hi] := (a * b)[127:64]
    +/

    version(LDC)
    {
        // LDC x86: Generates mulx from -O0.
        // Hand-written LLVM IR: widen both operands to i128, multiply, store the
        // top 64 bits through the `hi` pointer and return the bottom 64 bits.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
        ucent result = cast(ucent) a * b;
        *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
        return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/

        /+
        Implementation using 64bit math is more complex...
        a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
              = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low*(b_high << 32) + a_low*b_low
              = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
              = c2 << 64 + c11 << 32 + c12 << 32 + c0
              = z1 << 64 + z0
        // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
        // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
        // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
        z0 = (c0 & 0xFFFF_FFFF) + ((c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) << 32)
        The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) >> 32
        z1 = c2 + (c11 >> 32) + (c12 >> 32) + ((c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) >> 32)
        +/

        // Split each operand into 32-bit halves.
        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        // Four 32x32 -> 64-bit partial products (see derivation above).
        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        // common_term holds bits [63:32] of the low half plus the carry in its
        // own upper 32 bits; that carry is folded into z1 below.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}

/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (BMI2_builtins)
    {
        // Builtin unavailable during CTFE; use the software fallback there.
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
        else
            return pdep!uint(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
    assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}

/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (BMI2_builtins)
    {
        // CTFE cannot call the hardware builtin; fall back to the software path.
        if (__ctfe)
            return pdep!ulong(a, mask);
        else
            return __builtin_ia32_pdep_di(a, mask);
    }
    else
    {
        return pdep!ulong(a, mask);
    }
}
unittest
{
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
    assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}

// Helper function for PDEP
private T pdep(T)(T a, T mask)
{
    /+
    tmp := a
    dst := 0
    m := 0
    k := 0
    DO WHILE m < number of bits in T
        IF mask[m] == 1
            dst[m] := tmp[k]
            k := k + 1
        FI
        m := m + 1
    OD
    +/
    T dst = 0;
    T srcBit = 1; // selects the k-th (next) low bit of `a` to deposit
    T remaining = mask;
    // Walk the set bits of `mask` from lowest to highest; each one receives
    // the next low bit of `a`.
    while (remaining != 0)
    {
        const T depositBit = remaining & (~remaining + 1); // lowest set bit of the mask
        if (a & srcBit)
            dst |= depositBit;
        srcBit <<= 1;
        remaining &= remaining - 1; // clear that mask bit
    }
    return dst;
}


/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (BMI2_builtins)
    {
        // CTFE cannot call the hardware builtin; fall back to the software path.
        if (__ctfe)
            return pext!uint(a, mask);
        else
            return __builtin_ia32_pext_si(a, mask);
    }
    else
    {
        return pext!uint(a, mask);
    }
}
unittest
{
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
    assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}

/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (BMI2_builtins)
    {
        // CTFE cannot call the hardware builtin; fall back to the software path.
        if (__ctfe)
            return pext!ulong(a, mask);
        else
            return __builtin_ia32_pext_di(a, mask);
    }
    else
    {
        return pext!ulong(a, mask);
    }
}
unittest
{
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
    assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}

// Helper function for PEXT
private T pext(T)(T a, T mask)
{
    /+
    tmp := a
    dst := 0
    m := 0
    k := 0
    DO WHILE m < number of bits in T
        IF mask[m] == 1
            dst[k] := tmp[m]
            k := k + 1
        FI
        m := m + 1
    OD
    +/
    T dst = 0;
    T dstBit = 1; // next low bit of the result to fill
    T remaining = mask;
    // Walk the set bits of `mask` from lowest to highest; the bit of `a` at
    // each such position is packed into the next low bit of the result.
    while (remaining != 0)
    {
        const T extractBit = remaining & (~remaining + 1); // lowest set bit of the mask
        if (a & extractBit)
            dst |= dstBit;
        dstBit <<= 1;
        remaining &= remaining - 1; // clear that mask bit
    }
    return dst;
}