1 /** 2 * BMI2 intrinsics. 3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 4 * 5 * Copyright: Copyright Johan Engelen 2021. 6 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 7 */ 8 module inteli.bmi2intrin; 9 10 import inteli.internals; 11 12 static if (LDC_with_BMI2 || GDC_with_BMI2) 13 { 14 private enum BMI2_builtins = true; 15 } 16 else 17 { 18 private enum BMI2_builtins = false; 19 } 20 21 nothrow @nogc pure @safe: 22 23 /// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. 24 uint _bzhi_u32 (uint a, uint index) 25 { 26 static if (BMI2_builtins) 27 { 28 if (!__ctfe) 29 return __builtin_ia32_bzhi_si(a, index); 30 else 31 return bzhi!uint(a, index); 32 } 33 else 34 { 35 return bzhi!uint(a, index); 36 } 37 } 38 unittest 39 { 40 static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); 41 assert (_bzhi_u32(0x1234_5678, 5) == 0x18); 42 static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); 43 assert (_bzhi_u32(0x1234_5678, 10) == 0x278); 44 static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); 45 assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); 46 } 47 48 /// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. 49 ulong _bzhi_u64 (ulong a, uint index) 50 { 51 static if (BMI2_builtins) 52 { 53 if (!__ctfe) 54 { 55 version(X86_64) 56 { 57 // This instruction not available in 32-bit x86. 58 return __builtin_ia32_bzhi_di(a, index); 59 } 60 else 61 return bzhi!ulong(a, index); 62 } 63 else 64 return bzhi!ulong(a, index); 65 } 66 else 67 { 68 return bzhi!ulong(a, index); 69 } 70 } 71 unittest 72 { 73 static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); 74 assert (_bzhi_u64(0x1234_5678, 5) == 0x18); 75 static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); 76 assert (_bzhi_u64(0x1234_5678, 10) == 0x278); 77 static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); 78 assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); 79 static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); 80 assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); 81 } 82 83 // Helper function for BZHI 84 private T bzhi(T)(T a, uint index) 85 { 86 /+ 87 n := index[7:0] 88 dst := a 89 IF (n < number of bits) 90 dst[MSB:n] := 0 91 FI 92 +/ 93 enum numbits = T.sizeof*8; 94 T dst = a; 95 if (index < numbits) 96 { 97 T mask = (T(1) << index) - 1; 98 dst &= mask; 99 } 100 return dst; 101 } 102 103 /// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, and store the high 32-bits in `hi`. This does not read or write arithmetic flags. 104 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected. 105 uint _mulx_u32 (uint a, uint b, uint* hi) 106 { 107 /+ 108 dst[31:0] := (a * b)[31:0] 109 MEM[hi+31:hi] := (a * b)[63:32] 110 +/ 111 ulong result = cast(ulong) a * b; 112 *hi = cast(uint) ((result >>> 32) & 0xFFFF_FFFF); 113 return cast(uint) (result & 0xFFFF_FFFF); 114 } 115 @system unittest 116 { 117 uint hi; 118 assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); 119 assert (hi == 0x014B_66DC); 120 } 121 122 /// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and store the high 64-bits in `hi`. This does not read or write arithmetic flags. 123 /// TODO: the implementation _does_ set arithmetic flags, unless the x86 instruction mulx is indeed selected. 124 ulong _mulx_u64 (ulong a, ulong b, ulong* hi) 125 { 126 /+ 127 dst[63:0] := (a * b)[63:0] 128 MEM[hi+63:hi] := (a * b)[127:64] 129 +/ 130 131 version(LDC) 132 { 133 // LDC x86: Generates mulx from -O0 134 enum ir = ` 135 %4 = zext i64 %0 to i128 136 %5 = zext i64 %1 to i128 137 %6 = mul nuw i128 %5, %4 138 %7 = lshr i128 %6, 64 139 %8 = trunc i128 %7 to i64 140 store i64 %8, i64* %2, align 8 141 %9 = trunc i128 %6 to i64 142 ret i64 %9`; 143 return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); 144 } 145 else 146 { 147 /+ Straight-forward implementation with `ucent`: 148 ucent result = cast(ucent) a * b; 149 *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); 150 return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); 151 +/ 152 153 /+ 154 Implementation using 64bit math is more complex... 155 a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) 156 = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low 157 = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low 158 = c2 << 64 + c11 << 32 + c12 << 32 + c0 159 = z1 << 64 + z0 160 // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that 161 // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that 162 // intermediate result is then the 'carry' that we need to add when calculating z1's sum. 163 z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 164 The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 165 z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 166 +/ 167 168 const ulong a_low = a & 0xFFFF_FFFF; 169 const ulong a_high = a >>> 32; 170 const ulong b_low = b & 0xFFFF_FFFF; 171 const ulong b_high = b >>> 32; 172 173 const ulong c2 = a_high*b_high; 174 const ulong c11 = a_high*b_low; 175 const ulong c12 = a_low*b_high; 176 const ulong c0 = a_low*b_low; 177 178 const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); 179 const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); 180 const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); 181 182 *hi = z1; 183 return z0; 184 } 185 } 186 @system unittest 187 { 188 ulong hi; 189 // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 190 assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); 191 assert (hi == 0x14b_66dc_33f6_acdc); 192 } 193 194 /// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. 195 uint _pdep_u32 (uint a, uint mask) 196 { 197 static if (BMI2_builtins) 198 { 199 if (!__ctfe) 200 return __builtin_ia32_pdep_si(a, mask); 201 else 202 return pdep!uint(a, mask); 203 } 204 else 205 { 206 return pdep!uint(a, mask); 207 } 208 } 209 unittest 210 { 211 static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); 212 assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); 213 } 214 215 /// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. 216 ulong _pdep_u64 (ulong a, ulong mask) 217 { 218 static if (BMI2_builtins) 219 { 220 if (!__ctfe) 221 { 222 version(X86_64) 223 { 224 // This instruction not available in 32-bit x86. 225 return __builtin_ia32_pdep_di(a, mask); 226 } 227 else 228 return pdep!ulong(a, mask); 229 } 230 else 231 return pdep!ulong(a, mask); 232 } 233 else 234 { 235 return pdep!ulong(a, mask); 236 } 237 } 238 unittest 239 { 240 static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); 241 assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); 242 } 243 244 // Helper function for PDEP 245 private T pdep(T)(T a, T mask) 246 { 247 /+ 248 tmp := a 249 dst := 0 250 m := 0 251 k := 0 252 DO WHILE m < 32 253 IF mask[m] == 1 254 dst[m] := tmp[k] 255 k := k + 1 256 FI 257 m := m + 1 258 OD 259 +/ 260 T dst; 261 T k_bitpos = 1; 262 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed 263 foreach (m; 0..T.sizeof*8) 264 { 265 if (mask & m_bitpos) 266 { 267 dst |= (a & k_bitpos) ? m_bitpos : 0; 268 k_bitpos <<= 1; 269 } 270 m_bitpos <<= 1; 271 } 272 return dst; 273 } 274 275 276 /// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 277 uint _pext_u32 (uint a, uint mask) 278 { 279 static if (BMI2_builtins) 280 { 281 if (!__ctfe) 282 return __builtin_ia32_pext_si(a, mask); 283 else 284 return pext!uint(a, mask); 285 } 286 else 287 { 288 return pext!uint(a, mask); 289 } 290 } 291 unittest 292 { 293 static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); 294 assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); 295 } 296 297 /// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 298 ulong _pext_u64 (ulong a, ulong mask) 299 { 300 static if (BMI2_builtins) 301 { 302 if (!__ctfe) 303 { 304 version(X86_64) 305 { 306 // This instruction not available in 32-bit x86. 307 return __builtin_ia32_pext_di(a, mask); 308 } 309 else 310 return pext!ulong(a, mask); 311 } 312 else 313 return pext!ulong(a, mask); 314 } 315 else 316 { 317 return pext!ulong(a, mask); 318 } 319 } 320 unittest 321 { 322 static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); 323 assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); 324 } 325 326 // Helper function for PEXT 327 private T pext(T)(T a, T mask) 328 { 329 /+ 330 tmp := a 331 dst := 0 332 m := 0 333 k := 0 334 DO WHILE m < number of bits in T 335 IF mask[m] == 1 336 dst[k] := tmp[m] 337 k := k + 1 338 FI 339 m := m + 1 340 OD 341 +/ 342 T dst; 343 T k_bitpos = 1; 344 T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed 345 foreach (m; 0..T.sizeof*8) 346 { 347 if (mask & m_bitpos) 348 { 349 dst |= (a & m_bitpos) ? k_bitpos : 0; 350 k_bitpos <<= 1; 351 } 352 m_bitpos <<= 1; 353 } 354 return dst; 355 }