/**
* SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
*
* Copyright: Guillaume Piolat 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.nmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.smmintrin;
import core.bitop: bsf, bsr;


// Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
// - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
// - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions,
//   it is not considered implied by sse4.2 anymore.
// With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.

nothrow @nogc:

// <Data size and signedness>

/// String contains unsigned 8-bit characters (default).
enum int _SIDD_UBYTE_OPS = 0;

/// String contains unsigned 16-bit characters.
enum int _SIDD_UWORD_OPS = 1;

/// String contains signed 8-bit characters.
enum int _SIDD_SBYTE_OPS = 2;

/// String contains signed 16-bit characters.
enum int _SIDD_SWORD_OPS = 3;

// </Data size and signedness>


// <Comparison options>

/// For each character in `b`, find if it is in `a` (default)
/// The resulting mask has bit set at b positions that were found in a.
enum int _SIDD_CMP_EQUAL_ANY = 0;

/// For each character `c` in `b`, determine if
/// `a[0] <= c <= a[1] or a[2] <= c <= a[3]...`
/// (even-indexed elements of `a` are lower bounds, odd-indexed ones upper bounds).
/// Contrary to false documentation on the Internet, pairs must be in `a`!
enum int _SIDD_CMP_RANGES = 4;

/// The strings defined by `a` and `b` are equal
enum int _SIDD_CMP_EQUAL_EACH = 8;

/// Search for the defined substring in the target
enum int _SIDD_CMP_EQUAL_ORDERED = 12;

// </Comparison options>

// <Result polarity>

/// Do not negate results (default, no effect)
enum int _SIDD_POSITIVE_POLARITY = 0;

/// Negates results
enum int _SIDD_NEGATIVE_POLARITY = 16;

/// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`)
/// You basically never want this.
enum int _SIDD_MASKED_POSITIVE_POLARITY = 32;

/// Negates results only before the end of the string
enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48;

// </Result polarity>

// <Bit returned>

/// **Index only**: return the least significant bit (default).
enum int _SIDD_LEAST_SIGNIFICANT = 0;

/// **Index only**: return the most significant bit.
enum int _SIDD_MOST_SIGNIFICANT = 64;

// </Bit returned>

/// **Mask only**: return the bit mask (default).
enum int _SIDD_BIT_MASK = 0;

/// **Mask only**: return the byte/word mask.
enum int _SIDD_UNIT_MASK = 64;

/// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
///
/// Alternative explanation of imm8
///
/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
/// words and the type of comparison to do.
///
/// Bits [1:0]: Determine source data format.
///   00: 16 unsigned bytes
///   01: 8 unsigned words
///   10: 16 signed bytes
///   11: 8 signed words
///
/// Bits [3:2]: Determine comparison type and aggregation method.
///   00: Subset: Each character in B is compared for equality with all
///       the characters in A.
///   01: Ranges: Each character in B is compared to A pairs. The comparison
///       basis is greater than or equal for even-indexed elements in A,
///       and less than or equal for odd-indexed elements in A.
///   10: Match: Compare each pair of corresponding characters in A and
///       B for equality.
///   11: Substring: Search B for substring matches of A.
///
/// Bits [5:4]: Determine whether to do a one's complement on the bit
///             mask of the comparison results.
///   00: No effect.
///   01: Negate the bit mask.
///   10: No effect.
///   11: Negate the bit mask only for bits with an index less than or equal
///       to the size of A or B.
///
/// Bit [6]: Selects what is returned (see the `_SIDD_*` constants of value 64).
///   Index-returning intrinsics: 0 = least significant set bit, 1 = most significant set bit.
///   Mask-returning intrinsics:  0 = bit mask, 1 = byte/word (unit) mask.
///



/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if `b` "does not contain a null character"
/// and the resulting mask was zero, and 0 otherwise.
/// Warning: actually it seems the instruction does accept \0 in input, just the length must be >= count.
///          It's not clear for what purpose.
///
/// Params:
///     imm8 = Compile-time control word, a combination of `_SIDD_*` flags.
///     a    = First packed string.
///     la   = Length of `a`, in characters.
///     b    = Second packed string.
///     lb   = Length of `b`, in characters.
/// Returns: 1 or 0, as described above.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Software fallback, built on the shared mask helper.
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        // Byte lanes of `mask` that are zero become 0xFF in `equalZero`...
        __m128i equalZero = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
        // ...so `sigbits` is 0xffff exactly when the whole mask is zero.
        int sigbits = _mm_movemask_epi8(equalZero);
        enum int Count = (imm8 & 1) ? 8 : 16; // characters per vector: 8 words or 16 bytes
        // NOTE(review): `lb` is compared as-is here; the negative-length unittest below
        // presumably relies on cmpstrMaskExplicit saturating `lb` by reference - confirm.
        return (sigbits == 0xffff) && (lb >= Count);
    }
}
unittest
{
    char[16] A = "Maximum\x00length!!";
    char[16] B = "Mbximum\x00length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmA, 16));
    assert(0 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmB, 16));

    // test negative length, this will be clamped to 16
    assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, -160, mmA, -17));

    // it seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Test 16-bit format
    assert(1 == _mm_cmpestra!(_SIDD_SWORD_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 8, mmA, 8));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Software path: the result is 1 as soon as any byte of the comparison mask is set.
        __m128i cmpMask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        immutable int setBytes = _mm_movemask_epi8(cmpMask);
        return setBytes != 0 ? 1 : 0;
    }
}
unittest
{
    // Compare two shorter strings, strcmp-style: a full match yields 0.
    enum int strcmpLike = _SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY;

    char[16] A = "Hello world";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Also instantiates the mask-returning variant with the same control word.
    __m128i mask = _mm_cmpestrm!strcmpLike(mmA, 6, mmB, 6);

    // The first six characters ("Hello ") are identical...
    assert(0 == _mm_cmpestrc!strcmpLike(mmA, 6, mmB, 6));
    // ...the seventh one ('w' vs 'm') differs.
    assert(1 == _mm_cmpestrc!strcmpLike(mmA, 7, mmB, 7));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Convert the unit mask to bit mask
        static if (imm8 & 1)
        {
            // 16-bit characters: narrow each word to one byte so that
            // movemask yields exactly one bit per character.
            enum int Count = 8;
            mask = _mm_packs_epi16(mask, _mm_setzero_si128());
        }
        else
        {
            enum int Count = 16;
        }
        int signbits = _mm_movemask_epi8(mask);

        // An empty mask always reports `Count`, whichever end is requested.
        if (signbits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(signbits);
        else
            return bsf(signbits);
    }
}
unittest
{
    // Find the index of the first difference (at index 6)
    //                  v
    char[16] A = "Hello sun";
    char[16] B = "Hello moon";

    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY
                            | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10);
    assert(index == 6);

    // Those string must compare equal, regardless of what happens after their length.
    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);

    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                     v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);
    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_MASKED_NEGATIVE_POLARITY
                                          | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0, -1, 1000, 2000, 0, 0, 0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges  = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned words: the first pair reads as [0, 65535], which contains every value.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this result used to be computed but never checked

    // Signed words: [0, -1] is an empty range, only 1000 falls in [1000, 2000].
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Find a substring
    char[16] A = "def";
    char[16] B = "abcdefghdefff";
    char[16] C = "no substring";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_EQUAL_ORDERED
                                          | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13);
    byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                 | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                | _SIDD_CMP_EQUAL_ORDERED
                                | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13);
    assert(lastMatch == 8);
    firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                             | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12);
    assert(firstMatch == 16); // no substring found
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated mask.
/// With `_SIDD_UNIT_MASK` the mask has one byte/word per character,
/// otherwise (`_SIDD_BIT_MASK`) one bit per character in the low bits of the result.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if (imm8 & _SIDD_UNIT_MASK)
        {
            return mask;
        }
        else
        {
            // _SIDD_BIT_MASK
            static if (imm8 & 1)
            {
                // 16-bit characters: one byte per character before extracting the bits.
                mask = _mm_packs_epi16(mask, _mm_setzero_si128());
            }
            return _mm_cvtsi32_si128( _mm_movemask_epi8(mask));
        }
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                       | _SIDD_CMP_EQUAL_ANY
                                       | _SIDD_BIT_MASK)(mmA, -12, mmB, -6);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);
    byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS
                                        | _SIDD_CMP_EQUAL_ANY
                                        | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // The lowest bit of the first 32-bit lane is the lowest bit of the first
        // character's mask unit.
        int4 mask = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        return mask.array[0] & 1;
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    int res = _mm_cmpestro!(_SIDD_UBYTE_OPS
                          | _SIDD_CMP_EQUAL_ANY
                          | _SIDD_BIT_MASK)(mmA, 12, mmB, -6);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);
}

/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `la` is < Count.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // saturates lengths (the Intrinsics Guide doesn't tell this)
        if (la < 0) la = -la;
        // Negating int.min wraps around and stays negative: that length is
        // out of range too, so it saturates like any other oversized length.
        if (la < 0 || la > 16) la = 16;
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (la < Count);
    }
}
unittest
{
    __m128i a;
    a = 0;
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0);
    // The absolute value saturates, including for the one value that cannot be negated.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, int.min, a, 8) == 0);
}

/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `lb` is < Count.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // saturates lengths (the Intrinsics Guide doesn't tell this)
        if (lb < 0) lb = -lb;
        // Negating int.min wraps around and stays negative: saturate it as well.
        if (lb < 0 || lb > 16) lb = 16;
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (lb < Count);
    }
}
unittest
{
    // These checks used to call _mm_cmpestrs, leaving _mm_cmpestrz untested.
    __m128i b;
    b = 0;
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);
    // Only `lb` matters: `la` is short here, `lb` is full-length.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 3, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 3) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, int.min) == 0);
}

/// Compare packed signed 64-bit integers in a and b for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero otherwise.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(la, lb));
    }
    else
    {
        long2 r;
        r.ptr[0] = (la.array[0] > lb.array[0]) ? -1 : 0;
        r.ptr[1] = (la.array[1] > lb.array[1]) ? -1 : 0;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ];
    // This used to call _mm_cmpgt_epi32, which only passed by coincidence.
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Values on which a 32-bit lane comparison would give a different answer.
    __m128i C = _mm_setr_epi64(0x1_0000_0000, -1);
    __m128i D = _mm_setr_epi64(0x0_FFFF_FFFF,  0);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if `b` did not contain a null character and the resulting mask was zero,
/// and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: measure both null-terminated strings with the helper
        // matching the character width, then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias lengthOf = findLengthShort;
        else
            alias lengthOf = findLengthByte;

        return _mm_cmpestra!imm8(a, lengthOf(a), b, lengthOf(b));
    }
}
unittest
{
    char[16] A = "Maximum\x00one";
    char[16] B = "Maximum\x00four";
    char[16] C = "Mbximum\x00length!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // strcmp-like matching over 16 bytes of data: with a negated polarity,
    // a full match leaves the mask empty.

    // The strings match up to their terminator, but `b` is too short.
    assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_MASKED_NEGATIVE_POLARITY)(mmA, mmB));

    // The strings do not match.
    assert(0 == _mm_cmpistra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, mmC));
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: find the implicit lengths, then reuse the explicit-length variant.
        static if (imm8 & 1)
            alias lengthOf = findLengthShort;
        else
            alias lengthOf = findLengthByte;

        return _mm_cmpestrc!imm8(a, lengthOf(a), b, lengthOf(b));
    }
}
unittest
{
    // Compare two shorter strings, strcmp-style: a full match yields 0.
    enum int strcmpLike = _SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY;

    char[16] A = "Hello";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    assert(0 == _mm_cmpistrc!strcmpLike(mmA, mmA));
    assert(1 == _mm_cmpistrc!strcmpLike(mmA, mmB));
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`
/// and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: find the implicit lengths, then reuse the explicit-length variant.
        static if (imm8 & 1)
            alias lengthOf = findLengthShort;
        else
            alias lengthOf = findLengthByte;

        return _mm_cmpestri!imm8(a, lengthOf(a), b, lengthOf(b));
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                     v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Same comparison for both calls; only the kind of result differs.
    enum int notInRanges = _SIDD_UBYTE_OPS
                         | _SIDD_CMP_RANGES
                         | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 mask = cast(byte16)_mm_cmpistrm!(notInRanges | _SIDD_UNIT_MASK)(mmI, mmA);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpistri!(notInRanges | _SIDD_MOST_SIGNIFICANT)(mmI, mmA);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in
/// `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: find the implicit lengths, then reuse the explicit-length variant.
        static if (imm8 & 1)
            alias lengthOf = findLengthShort;
        else
            alias lengthOf = findLengthByte;

        return _mm_cmpestrm!imm8(a, lengthOf(a), b, lengthOf(b));
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Bit mask: 'e', 'o', and '!' were found, i.e. bits 1, 3 and 5.
    byte16 R = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_BIT_MASK)(mmA, mmB);
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);

    // Unit mask: one byte per character of B.
    byte16 M = cast(byte16) _mm_cmpistrm!(anyOf | _SIDD_UNIT_MASK)(mmA, mmB);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings with implicit lengths in `a` and `b` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Software path: find the implicit lengths, then reuse the explicit-length variant.
        static if (imm8 & 1)
            alias lengthOf = findLengthShort;
        else
            alias lengthOf = findLengthByte;

        return _mm_cmpestro!imm8(a, lengthOf(a), b, lengthOf(b));
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    char[16] C = "Z";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // Is the first letter of the second string found in A?
    enum int anyOfBits = _SIDD_UBYTE_OPS
                       | _SIDD_CMP_EQUAL_ANY
                       | _SIDD_BIT_MASK;

    int res = _mm_cmpistro!anyOfBits(mmA, mmB);
    assert(res == 1); // because 'a' was found in "Hallo world!"

    res = _mm_cmpistro!anyOfBits(mmA, mmC);
    assert(res == 0); // because 'Z' wasn't found in A
}

/// Returns 1 if any character in `a` was null, and 0 otherwise.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `a` holds a null character exactly when its measured length
        // is not the full vector (8 words or 16 bytes).
        static if (imm8 & 1)
            return findLengthShort(a) != 8;
        else
            return findLengthByte(a) != 16;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // Empty and short strings contain a terminator, the full-length one does not.
    assert(1 == _mm_cmpistrs!_SIDD_UBYTE_OPS(mmA, mmA));
    assert(1 == _mm_cmpistrs!_SIDD_SBYTE_OPS(mmB, mmB));
    assert(0 == _mm_cmpistrs!_SIDD_UWORD_OPS(mmC, mmC));
}

/// Returns 1 if any character in `b` was null, and 0 otherwise.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `b` holds a null character exactly when its measured length
        // is not the full vector (8 words or 16 bytes).
        static if (imm8 & 1)
            return findLengthShort(b) != 8;
        else
            return findLengthByte(b) != 16;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // Only the second operand is inspected.
    assert(1 == _mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA));
    assert(1 == _mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB));
    assert(0 == _mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC));
}


/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_CRC32)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // No hardware CRC: feed the two bytes to the 8-bit variant, low byte first.
        immutable uint afterLowByte = _mm_crc32_u8(crc, v & 0xff);
        return _mm_crc32_u8(afterLowByte, v >> 8);
    }
}
unittest
{
    // Reference values for three (seed, input) pairs.
    assert(0x39c3f0ff == _mm_crc32_u16(0x12345678, 0x4512));
    assert(0xcffbcf07 == _mm_crc32_u16(0x76543210, 0xf50f));
    assert(0xc7e3fe85 == _mm_crc32_u16(0xDEADBEEF, 0x0017));
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_CRC32)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // No hardware CRC: feed the four bytes to the 8-bit variant,
        // from the least significant to the most significant.
        for (int shift = 0; shift < 32; shift += 8)
        {
            crc = _mm_crc32_u8(crc, cast(ubyte)((v >> shift) & 0xff));
        }
        return crc;
    }
}
unittest
{
    // Reference values for three (seed, input) pairs.
    assert(0x22a6ec54 == _mm_crc32_u32(0x12345678, 0x45123563));
    assert(0x7019a6cf == _mm_crc32_u32(0x76543210, 0xf50f9993));
    assert(0xbc552c27 == _mm_crc32_u32(0xDEADBEEF, 0x00170017));
}

/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only usable when targeting x86_64.
    version(X86_64)
        enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_CRC32;
    else
        enum bool hasX86Intrin = false; // intrinsics not available in 32-bit

    static if (hasX86Intrin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // No hardware CRC: only the low 32 bits of `crc` take part; feed the eight
        // bytes of `v` to the 8-bit variant, least significant byte first.
        uint running = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
        {
            running = _mm_crc32_u8(running, cast(ubyte)((v >> shift) & 0xff));
        }
        return running;
    }
}
unittest
{
    // Reference values for three (seed, input) pairs; the result fits in 32 bits.
    assert(0xd66b1074 == _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07));
    assert(0xac12f9c6 == _mm_crc32_u64(0x7654321001234567, 0xFACEFEED));
    assert(0xa2d13dd8 == _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017));
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    // The GDC and LDC x86 paths use the same builtin.
    static if (GDC_with_SSE42 || LDC_with_CRC32)
    {
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Table-driven software fallback: the low byte of `crc ^ v` selects 
        // the table entry, which is combined with the remaining bits of `crc`.
        const uint index = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[index];
    }
}
unittest
{
    assert(_mm_crc32_u8(0x12345678, 0x45) == 0x8fd93134);
    assert(_mm_crc32_u8(0x76543210, 0xf5) == 0xd6b7e834);
    assert(_mm_crc32_u8(0xDEADBEEF, 0x00) == 0xbdfd3980);
}


// Utilities for this file

private:

// `NeedCRC32CTable` tells whether the software CRC lookup table must be compiled in.
// With the x86 builtins the table is only dropped on X86_64; with the ARM64 CRC 
// instructions it is never needed; without any of those it is always needed.
static if (GDC_with_SSE42 || LDC_with_CRC32)
{
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else static if (LDC_with_ARM64_CRC)
{
    enum bool NeedCRC32CTable = false;
}
else
{
    enum bool NeedCRC32CTable = true;
}

static if (NeedCRC32CTable)
{
    // Lookup table indexed by a byte value, used by the software CRC fallback.
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}

// Returns the index of the first zero byte in `a`, or 16 if `a` holds no zero byte.
int findLengthByte(__m128i a) pure @safe
{
    // Bit i of `nulBits` is set when byte i of `a` is zero.
    int nulBits = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));
    return (nulBits != 0) ? bsf(nulBits) : 16;
}
unittest
{
    char[16] shortText = "Hel!o";
    char[16] fullText  = "Maximum length!!";
    assert(findLengthByte(_mm_loadu_si128(cast(__m128i*)shortText.ptr)) == 5);
    assert(findLengthByte(_mm_loadu_si128(cast(__m128i*)fullText.ptr)) == 16);
}

// Returns the index of the first zero 16-bit word in `a`, or 8 if `a` holds no zero word.
int findLengthShort(__m128i a) pure @safe
{
    // A zero word sets two consecutive bits in the byte mask, hence the 
    // division by two to get back to a word index.
    int nulBits = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));
    return (nulBits != 0) ? (bsf(nulBits) >> 1) : 8;
}
unittest
{
    short[8] withZero    = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] withoutZero = [-1, -2, -3, 4, 5, 6, -32768, 1];
    assert(findLengthShort(_mm_loadu_si128(cast(__m128i*)withZero.ptr)) == 3);
    assert(findLengthShort(_mm_loadu_si128(cast(__m128i*)withoutZero.ptr)) == 8);
}

// 16 bytes of all-ones followed by 16 bytes of zeroes. An unaligned 16-byte load
// starting at offset `16 - n` yields n leading 0xFF bytes followed by zeroes.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
];

// Makes a byte validity mask with a given explicit length string.
// The load starts at `MASK_DATA[16 - len]`.
__m128i validMask8e(int len) @trusted
{
    auto start = &MASK_DATA[16 - len];
    return _mm_loadu_si128(cast(__m128i*) start);
}
unittest
{
    byte[16] noneValid = 0;
    byte[16] allValid  = -1;
    byte16 emptyMask = cast(byte16) validMask8e(0);
    byte16 fullMask  = cast(byte16) validMask8e(16);
    assert(emptyMask.array == noneValid);
    assert(fullMask.array == allValid);
}

// Makes a short validity mask with a given explicit length string.
// Each character spans two bytes, so the load starts at `MASK_DATA[16 - len*2]`.
__m128i validMask16e(int len) @trusted
{
    auto start = &MASK_DATA[16 - len * 2];
    return _mm_loadu_si128(cast(__m128i*) start);
}
unittest
{
    short[8] expected = [-1, -1, -1, 0, 0, 0, 0, 0];
    short8 mask = cast(short8) validMask16e(3);
    assert(mask.array == expected);
}

// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask.
// `aValid` and `bValid` are byte-mask or word-mask of the valid
// zone in `a` and `b`.
// Explicit-length entry point: turns the lengths `la` and `lb` into validity 
// masks, then defers to `cmpstrMask`.
//
// `la` and `lb` are passed by reference and are normalized in place: a length is
// replaced by its absolute value, saturated to the number of characters that fit
// in a vector (16 for 8-bit characters, 8 for 16-bit characters). Callers can 
// thus reuse the saturated lengths after the call.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    enum bool chars16Bits = (imm8 & 1) != 0;

    // Saturates lengths (the Intrinsics Guide doesn't tell this).
    // The saturation bound depends on the character size: going above 8 with
    // 16-bit characters would make `validMask16e` read before `MASK_DATA`.
    enum int maxLength = chars16Bits ? 8 : 16;

    // Saturation is tested before negating, so that `int.min` (whose negation 
    // overflows) is handled like any other out-of-range length.
    if (la > maxLength || la < -maxLength) 
        la = maxLength;
    else if (la < 0) 
        la = -la;

    if (lb > maxLength || lb < -maxLength) 
        lb = maxLength;
    else if (lb < 0) 
        lb = -lb;

    static if (chars16Bits)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}

// Compares the strings `a` and `b` according to `imm8` and returns a byte-mask 
// (8-bit characters) or word-mask (16-bit characters) of the result.
//
// Params:
//     imm8   = bit 0 selects 16-bit characters, bit 1 selects signed comparison 
//              (only meaningful for ranges), bits 3:2 select the comparison mode,
//              bits 5:4 select the polarity of the result.
//     a      = first string.
//     aValid = all-ones in each valid character of `a`, zero elsewhere.
//     b      = second string.
//     bValid = all-ones in each valid character of `b`, zero elsewhere.
//
// Returns: a mask with all-ones in each character position where the comparison holds.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b, is it equal to any valid character of a?
        // `a` is rotated one character at a time so that every character of `a`
        // meets every character of `b`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else // 256 comparisons
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons.
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here.
                // Only the upper bound is tested: validity is a prefix, so a 
                // valid upper bound implies a valid lower bound.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is NOT(b < min), b <= max is NOT(b > max).
                    __m128i allOnes = _mm_set1_epi32(-1);
                    __m128i ge = _mm_xor_si128(_mm_cmplt_epi8(b, _mm_set1_epi8(min)), allOnes);
                    __m128i le = _mm_xor_si128(_mm_cmpgt_epi8(b, _mm_set1_epi8(max)), allOnes);
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons.
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // A start position stays candidate only if every character matched so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // A start position stays candidate only if every character matched so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY)
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}