/**
* BMI2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2
*
* Copyright: Copyright Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.bmi2intrin;

import inteli.internals;

nothrow @nogc pure @safe:

/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
uint _bzhi_u32 (uint a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // Compiler builtins cannot run under CTFE; use the portable helper there.
        if (!__ctfe)
            return __builtin_ia32_bzhi_si(a, index);
        else
            return bzhi!uint(a, index);
    }
    else
    {
        return bzhi!uint(a, index);
    }
}
unittest
{
    static assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u32(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u32(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678);
}

/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index.
ulong _bzhi_u64 (ulong a, uint index)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        if (!__ctfe)
        {
            version(X86_64)
            {
                // This instruction not available in 32-bit x86.
                return __builtin_ia32_bzhi_di(a, index);
            }
            else
                return bzhi!ulong(a, index);
        }
        else
            return bzhi!ulong(a, index);
    }
    else
    {
        return bzhi!ulong(a, index);
    }
}
unittest
{
    static assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    assert (_bzhi_u64(0x1234_5678, 5) == 0x18);
    static assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    assert (_bzhi_u64(0x1234_5678, 10) == 0x278);
    static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678);
    static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
    assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678);
}

// Helper function for BZHI: portable/CTFE implementation following the Intel pseudocode.
private T bzhi(T)(T a, uint index)
{
    /+
        n := index[7:0]
        dst := a
        IF (n < number of bits)
            dst[MSB:n] := 0
        FI
    +/
    enum numbits = T.sizeof*8;
    T dst = a;
    // If index >= numbits, all bits are kept (and the shift below would be UB anyway).
    if (index < numbits)
    {
        T mask = (T(1) << index) - 1;
        dst &= mask;
    }
    return dst;
}

/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst,
/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
uint _mulx_u32 (uint a, uint b, uint* hi)
{
    // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for
    // some reason, even with LLVM IR.
    // Also same with GDC.
    ulong result = cast(ulong) a * b;
    *hi = cast(uint) (result >>> 32);
    return cast(uint)result;
}
@system unittest
{
    uint hi;
    assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840);
    assert (hi == 0x014B_66DC);
}

/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and
/// store the high 64-bits in `hi`. This does not read or write arithmetic flags.
/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say.
/// But, those particular semantics don't exist at the level of intrinsics.
ulong _mulx_u64 (ulong a, ulong b, ulong* hi)
{
    /+
        dst[63:0] := (a * b)[63:0]
        MEM[hi+63:hi] := (a * b)[127:64]
    +/

    version(LDC)
    {
        // LDC x86: Generates mulx from -O0
        // The 128-bit widening multiply is expressed directly in LLVM IR.
        enum ir = `
            %4 = zext i64 %0 to i128
            %5 = zext i64 %1 to i128
            %6 = mul nuw i128 %5, %4
            %7 = lshr i128 %6, 64
            %8 = trunc i128 %7 to i64
            store i64 %8, i64* %2, align 8
            %9 = trunc i128 %6 to i64
            ret i64 %9`;
        return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi);
    }
    else
    {
        /+ Straight-forward implementation with `ucent`:
            ucent result = cast(ucent) a * b;
            *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF);
            return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF);
        +/

        /+
            Implementation using 64bit math is more complex...
            a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low)
                  = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low
                  = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low
                  = c2 << 64 + c11 << 32 + c12 << 32 + c0
                  = z1 << 64 + z0
            // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). We can do that
            // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that
            // intermediate result is then the 'carry' that we need to add when calculating z1's sum.
            z0 = (c0 & 0xFFFF_FFFF) + ((c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) << 32)
            The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) >> 32
            z1 = c2 + (c11 >> 32 + c12 >> 32 + ((c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF) >> 32))
        +/

        const ulong a_low = a & 0xFFFF_FFFF;
        const ulong a_high = a >>> 32;
        const ulong b_low = b & 0xFFFF_FFFF;
        const ulong b_high = b >>> 32;

        const ulong c2 = a_high*b_high;
        const ulong c11 = a_high*b_low;
        const ulong c12 = a_low*b_high;
        const ulong c0 = a_low*b_low;

        // common_term = high half of c0 plus the low halves of the two cross products;
        // its own high 32 bits are the carry into the upper 64-bit word.
        const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF);
        const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32);
        const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32);

        *hi = z1;
        return z0;
    }
}
@system unittest
{
    ulong hi;
    // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100
    assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100);
    assert (hi == 0x14b_66dc_33f6_acdc);
}

/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
uint _pdep_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // Compiler builtins cannot run under CTFE; use the portable helper there.
        if (!__ctfe)
            return __builtin_ia32_pdep_si(a, mask);
        else
            return pdep!uint(a, mask);
    }
    else
    {
        return pdep!uint(a, mask);
    }
}
unittest
{
    static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
    assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708);
}

/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero.
ulong _pdep_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // 64-bit PDEP is only encodable in 64-bit mode, and builtins
            // cannot execute under CTFE — fall through to the helper otherwise.
            if (!__ctfe)
                return __builtin_ia32_pdep_di(a, mask);
        }
    }
    return pdep!ulong(a, mask);
}
unittest
{
    static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
    assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201);
}

// Helper function for PDEP: portable/CTFE implementation.
// Scatters the low bits of `a` into the positions of the set bits of `mask`.
private T pdep(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        k := 0
        FOR m IN 0 .. number of bits in T
            IF mask[m] == 1
                dst[m] := tmp[k]
                k := k + 1
            FI
        ROF
    +/
    T dst = 0;
    T src = a;                  // source bits are consumed from the bottom up
    for (T probe = T(1); probe != 0; probe <<= 1)
    {
        if (mask & probe)
        {
            if (src & 1)
                dst |= probe;
            src >>= 1;          // one source bit consumed per set mask bit
        }
    }
    return dst;
}

/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
uint _pext_u32 (uint a, uint mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        // Builtins cannot execute under CTFE — fall through to the helper there.
        if (!__ctfe)
            return __builtin_ia32_pext_si(a, mask);
    }
    return pext!uint(a, mask);
}
unittest
{
    static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
    assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468);
}

/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by
/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
ulong _pext_u64 (ulong a, ulong mask)
{
    static if (GDC_or_LDC_with_BMI2)
    {
        version(X86_64)
        {
            // 64-bit PEXT is only encodable in 64-bit mode, and builtins
            // cannot execute under CTFE — fall through to the helper otherwise.
            if (!__ctfe)
                return __builtin_ia32_pext_di(a, mask);
        }
    }
    return pext!ulong(a, mask);
}
unittest
{
    static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
    assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531);
}

// Helper function for PEXT: portable/CTFE implementation.
// Gathers the bits of `a` selected by `mask` into the low bits of the result.
private T pext(T)(T a, T mask)
{
    /+
        tmp := a
        dst := 0
        k := 0
        FOR m IN 0 .. number of bits in T
            IF mask[m] == 1
                dst[k] := tmp[m]
                k := k + 1
            FI
        ROF
    +/
    T dst = 0;
    T outBit = T(1);            // next destination bit to be filled
    for (T probe = T(1); probe != 0; probe <<= 1)
    {
        if (mask & probe)
        {
            if (a & probe)
                dst |= outBit;
            outBit <<= 1;       // one destination bit filled per set mask bit
        }
    }
    return dst;
}