/**
* SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
*
* Copyright: Guillaume Piolat 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.nmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.smmintrin;
import core.bitop: bsf, bsr;


// Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
// Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.

nothrow @nogc:

// <Data size and signedness>

/// String contains unsigned 8-bit characters (default).
enum int _SIDD_UBYTE_OPS = 0;

/// String contains unsigned 16-bit characters.
enum int _SIDD_UWORD_OPS = 1;

/// String contains signed 8-bit characters.
enum int _SIDD_SBYTE_OPS = 2;

/// String contains signed 16-bit characters.
enum int _SIDD_SWORD_OPS = 3;

// </Data size and signedness>


// <Comparison options>

/// For each character in `b`, find if it is in `a` (default).
/// The resulting mask has a bit set at each `b` position that was found in `a`.
enum int _SIDD_CMP_EQUAL_ANY = 0;

/// For each character in `b`, determine if
/// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...`
/// Contrary to false documentation on the Internet, pairs must be in `a`!
enum int _SIDD_CMP_RANGES = 4;

/// The strings defined by `a` and `b` are equal.
enum int _SIDD_CMP_EQUAL_EACH = 8;

/// Search for the defined substring in the target.
enum int _SIDD_CMP_EQUAL_ORDERED = 12;

// </Comparison options>

// <Result polarity>

/// Do not negate results (default, no effect).
enum int _SIDD_POSITIVE_POLARITY = 0;

/// Negates results.
enum int _SIDD_NEGATIVE_POLARITY = 16;

/// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`)
/// You basically never want this.
enum int _SIDD_MASKED_POSITIVE_POLARITY = 32;

/// Negates results only before the end of the string.
enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48;

// </Result polarity>

// <Bit returned>

/// **Index only**: return the least significant bit (default).
enum int _SIDD_LEAST_SIGNIFICANT = 0;

/// **Index only**: return the most significant bit.
enum int _SIDD_MOST_SIGNIFICANT = 64;

// </Bit returned>

/// **Mask only**: return the bit mask (default).
enum int _SIDD_BIT_MASK = 0;

/// **Mask only**: return the byte/word mask.
/// Note: shares the value 64 with `_SIDD_MOST_SIGNIFICANT`; which meaning applies
/// depends on whether an index or a mask intrinsic is used.
enum int _SIDD_UNIT_MASK = 64;

/// SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
///
/// Alternative explanation of imm8
///
/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
/// words and the type of comparison to do.
///
/// Bits [1:0]: Determine source data format.
///   00: 16 unsigned bytes
///   01: 8 unsigned words
///   10: 16 signed bytes
///   11: 8 signed words
///
/// Bits [3:2]: Determine comparison type and aggregation method.
///   00: Subset: Each character in B is compared for equality with all
///       the characters in A.
///   01: Ranges: Each character in B is compared to A pairs. The comparison
///       basis is greater than or equal for even-indexed elements in A,
///       and less than or equal for odd-indexed elements in A.
///   10: Match: Compare each pair of corresponding characters in A and
///       B for equality.
///   11: Substring: Search B for substring matches of A.
///
/// Bits [5:4]: Determine whether to do a one's complement on the bit
///             mask of the comparison results.
///   00: No effect.
///   01: Negate the bit mask.
///   10: No effect.
///   11: Negate the bit mask only for bits with an index less than or equal
///       to the size of A or B.
///



/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if `b` "does not contain a null character"
/// and the resulting mask was zero, and 0 otherwise.
/// Warning: actually it seems the instruction does accept \0 in input, just the length must be >= count.
/// It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Fallback: build the comparison mask, then check that every one of its
        // 16 bytes is zero (movemask of the "== 0" compare is then 0xffff).
        // NOTE(review): the negative-length unittest below implies that
        // `cmpstrMaskExplicit` saturates `la`/`lb` in place before `lb` is
        // tested here; that helper is not visible in this part of the file — confirm.
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        __m128i equalZero = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
        int sigbits = _mm_movemask_epi8(equalZero);
        enum int Count = (imm8 & 1) ? 8 : 16; // number of characters in a full vector
        return (sigbits == 0xffff) && (lb >= Count);
    }
}
unittest
{
    char[16] A = "Maximum\x00length!!";
    char[16] B = "Mbximum\x00length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmA, 16));
    assert(0 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 16, mmB, 16));

    // test negative length, this will be clamped to 16
    assert(1 == _mm_cmpestra!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, -160, mmA, -17));

    // it seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Test 16-bit format
    assert(1 == _mm_cmpestra!(_SIDD_SWORD_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY)(mmA, 8, mmA, 8));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Portable path: the answer is 1 as soon as any byte of the
        // comparison mask has its top bit set.
        const int topBits = _mm_movemask_epi8(cmpstrMaskExplicit!imm8(a, la, b, lb));
        return topBits ? 1 : 0;
    }
}
unittest
{
    // Compare two shorter strings; a match gives 0, like strcmp.
    enum int strcmpLike = _SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY;

    char[16] world = "Hello world";
    char[16] moon  = "Hello moon";
    __m128i mmWorld = _mm_loadu_si128(cast(__m128i*)world.ptr);
    __m128i mmMoon  = _mm_loadu_si128(cast(__m128i*)moon.ptr);

    // The mask variant is instantiated with the same control, result unused.
    __m128i unusedMask = _mm_cmpestrm!strcmpLike(mmWorld, 6, mmMoon, 6);

    // The first six characters ("Hello ") are identical...
    assert(0 == _mm_cmpestrc!strcmpLike(mmWorld, 6, mmMoon, 6));
    // ...but the seventh differs ('w' versus 'm').
    assert(1 == _mm_cmpestrc!strcmpLike(mmWorld, 7, mmMoon, 7));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Convert the unit mask to a bit mask: in word mode, first narrow the
        // eight 16-bit units to bytes so that movemask yields one bit per character.
        static if (imm8 & 1)
        {
            enum int Count = 8;
            mask = _mm_packs_epi16(mask, _mm_setzero_si128());
        }
        else
        {
            enum int Count = 16;
        }
        int signbits = _mm_movemask_epi8(mask);

        // An empty mask always reports `Count`, whichever end was requested.
        if (signbits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(signbits); // index of the highest set bit
        else
            return bsf(signbits); // index of the lowest set bit
    }
}
unittest
{
    // Find the index of the first difference (at index 6)
    //                    v
    char[16] A = "Hello sun";
    char[16] B = "Hello moon";

    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY
                            | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10);
    assert(index == 6);

    // Those strings must compare equal, regardless of what happens after their length.
    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);

    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                         v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);
    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_MASKED_NEGATIVE_POLARITY
                                          | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0, -1, 1000, 2000, 0, 0, 0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges  = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned words: the first range is [0, 0xFFFF], which contains every value.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this check was previously missing

    // Signed words: [0, -1] is an empty range, so only 1000 (in [1000, 2000]) matches.
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Find a substring
    char[16] A = "def";
    char[16] B = "abcdefghdefff";
    char[16] C = "no substring";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_EQUAL_ORDERED
                                          | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13);
    byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                 | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                | _SIDD_CMP_EQUAL_ORDERED
                                | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13);
    assert(lastMatch == 8);
    firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                             | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12);
    assert(firstMatch == 16); // no substring found
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated mask.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i mask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if (imm8 & _SIDD_UNIT_MASK)
        {
            // The helper's result is returned unchanged as the byte/word mask.
            return mask;
        }
        else
        {
            // _SIDD_BIT_MASK: compress to one bit per character, stored in the
            // low 32-bit lane (the other lanes are zero).
            static if (imm8 & 1)
            {
                mask = _mm_packs_epi16(mask, _mm_setzero_si128());
            }
            return _mm_cvtsi32_si128( _mm_movemask_epi8(mask));
        }
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                       | _SIDD_CMP_EQUAL_ANY
                                       | _SIDD_BIT_MASK)(mmA, -12, mmB, -6);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);
    byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS
                                        | _SIDD_CMP_EQUAL_ANY
                                        | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Take the lowest bit of the helper's mask, read through its first 32-bit lane.
        int4 mask = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        return mask.array[0] & 1;
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    int res = _mm_cmpestro!(_SIDD_UBYTE_OPS
                          | _SIDD_CMP_EQUAL_ANY
                          | _SIDD_BIT_MASK)(mmA, 12, mmB, -6);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);
}

/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `la` is < Count.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // Saturate |la| to Count (the Intrinsics Guide doesn't tell this).
        // The range test comes first so that `int.min`, whose negation wraps
        // back to `int.min`, saturates like any other out-of-range length.
        if (la < -Count || la > Count)
            la = Count;
        else if (la < 0)
            la = -la;
        return (la < Count);
    }
}
unittest
{
    __m128i a;
    a = 0;
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0);
    // Most negative length must saturate to 16, not stay negative.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, int.min, a, 8) == 0);
}

/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `lb` is < Count.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // Saturate |lb| to Count (the Intrinsics Guide doesn't tell this).
        // The range test comes first so that `int.min`, whose negation wraps
        // back to `int.min`, saturates like any other out-of-range length.
        if (lb < -Count || lb > Count)
            lb = Count;
        else if (lb < 0)
            lb = -lb;
        return (lb < Count);
    }
}
unittest
{
    __m128i b;
    b = 0;
    // These checks previously called _mm_cmpestrs, leaving _mm_cmpestrz untested.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);
    // Only `lb` matters: a short `a` with a full-length `b` gives 0.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 3, b, 16) == 0);
    // Most negative length must saturate to 16, not stay negative.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, int.min) == 0);
}

/// Compare packed signed 64-bit integers in a and b for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero otherwise.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(la, lb));
    }
    else
    {
        long2 r;
        r.ptr[0] = (la.array[0] > lb.array[0]) ? 0xffffffff_ffffffff : 0;
        r.ptr[1] = (la.array[1] > lb.array[1]) ? 0xffffffff_ffffffff : 0;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4,  -2);
    long[2] correct = [ 0, -1 ];
    // This previously called _mm_cmpgt_epi32 and only passed by coincidence.
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Operands whose order is decided by the upper 32 bits only.
    __m128i C = _mm_setr_epi64(0x1_0000_0000, 0);
    __m128i D = _mm_setr_epi64(0xFFFF_FFFF, 0);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if `b` did not contain a null character and the resulting mask was zero,
/// and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        int la = implicitLength(a);
        int lb = implicitLength(b);
        return _mm_cmpestra!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] one     = "Maximum\x00one";
    char[16] four    = "Maximum\x00four";
    char[16] differs = "Mbximum\x00length!";
    __m128i mmOne     = _mm_loadu_si128(cast(__m128i*)one.ptr);
    __m128i mmFour    = _mm_loadu_si128(cast(__m128i*)four.ptr);
    __m128i mmDiffers = _mm_loadu_si128(cast(__m128i*)differs.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    enum int maskedNeg = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    enum int plainNeg  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    assert(0 == _mm_cmpistra!maskedNeg(mmOne, mmFour));   // match, but b is too short
    assert(0 == _mm_cmpistra!plainNeg(mmOne, mmDiffers)); // do not match
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Compute the implicit lengths with the finder matching the character
        // width, then reuse the explicit-length implementation.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        int la = implicitLength(a);
        int lb = implicitLength(b);
        return _mm_cmpestrc!imm8(a, la, b, lb);
    }
}
unittest
{
    // Compare two shorter strings; a match gives 0, like strcmp.
    enum int strcmpLike = _SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY;

    char[16] hello = "Hello";
    char[16] moon  = "Hello moon";
    __m128i mmHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i mmMoon  = _mm_loadu_si128(cast(__m128i*)moon.ptr);

    assert(0 == _mm_cmpistrc!strcmpLike(mmHello, mmHello));
    assert(1 == _mm_cmpistrc!strcmpLike(mmHello, mmMoon));
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`
/// and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Derive both lengths with the finder matching the character width,
        // then let the explicit-length version produce the index.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        int la = implicitLength(a);
        int lb = implicitLength(b);
        return _mm_cmpestri!imm8(a, la, b, lb);
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                            v (at index 7)
    char[16] text        = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    enum int outsideRanges = _SIDD_UBYTE_OPS
                           | _SIDD_CMP_RANGES
                           | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 mask = cast(byte16)_mm_cmpistrm!(outsideRanges | _SIDD_UNIT_MASK)(mmRanges, mmText);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpistri!(outsideRanges | _SIDD_MOST_SIGNIFICANT)(mmRanges, mmText);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in
/// `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the finder matching the character width,
        // then build the mask through the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        int la = implicitLength(a);
        int lb = implicitLength(b);
        return _mm_cmpestrm!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] sentence = "Hello world!";
    char[16] wanted   = "aeiou!";
    __m128i mmSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i mmWanted   = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters from `wanted` were found in `sentence`.
    byte16 R = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_BIT_MASK)(mmSentence, mmWanted);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);

    byte16 M = cast(byte16) _mm_cmpistrm!(anyOf | _SIDD_UNIT_MASK)(mmSentence, mmWanted);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings with implicit lengths in `a` and `b` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Find both implicit lengths with the finder matching the character
        // width, then reuse the explicit-length implementation.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        int la = implicitLength(a);
        int lb = implicitLength(b);
        return _mm_cmpestro!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] sentence = "Hallo world!";
    char[16] vowels   = "aeiou!";
    char[16] absent   = "Z";
    __m128i mmSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i mmVowels   = _mm_loadu_si128(cast(__m128i*)vowels.ptr);
    __m128i mmAbsent   = _mm_loadu_si128(cast(__m128i*)absent.ptr);

    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // Bit 0 is set because 'a', the first letter of `vowels`, was found in "Hallo world!"
    assert(_mm_cmpistro!anyOfBits(mmSentence, mmVowels) == 1);

    // Bit 0 is clear because 'Z' wasn't found in `sentence`
    assert(_mm_cmpistro!anyOfBits(mmSentence, mmAbsent) == 0);
}

/// Returns 1 if any character in `a` was null, and 0 otherwise.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `a` holds a null character exactly when its implicit length is
        // shorter than a full vector (8 words or 16 bytes).
        static if (imm8 & 1)
            return findLengthShort(a) != 8;
        else
            return findLengthByte(a) != 16;
    }
}
unittest
{
    char[16] empty = "";
    char[16] hello = "hello";
    char[16] full  = "Maximum length!!";
    __m128i mmEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i mmHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)full.ptr);
    assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(mmEmpty, mmEmpty) == 1);
    assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(mmHello, mmHello) == 1);
    assert(_mm_cmpistrs!_SIDD_UWORD_OPS(mmFull, mmFull) == 0);
}

/// Returns 1 if any character in `b` was null, and 0 otherwise.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `b` holds a null character exactly when its implicit length is
        // shorter than a full vector (8 words or 16 bytes).
        static if (imm8 & 1)
            return findLengthShort(b) != 8;
        else
            return findLengthByte(b) != 16;
    }
}
unittest
{
    char[16] empty = "";
    char[16] hello = "hello";
    char[16] full  = "Maximum length!!";
    __m128i mmEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i mmHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)full.ptr);
    assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmFull, mmEmpty) == 1);
    assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmFull, mmHello) == 1);
    assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmEmpty, mmFull) == 0);
}


/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // Feed the two bytes to the 8-bit variant, low byte first.
        const uint afterLow = _mm_crc32_u8(crc, v & 0xff);
        return _mm_crc32_u8(afterLow, v >> 8);
    }
}
unittest
{
    uint A = _mm_crc32_u16(0x12345678, 0x4512);
    uint B = _mm_crc32_u16(0x76543210, 0xf50f);
    uint C = _mm_crc32_u16(0xDEADBEEF, 0x0017);
    assert(A == 0x39c3f0ff);
    assert(B == 0xcffbcf07);
    assert(C == 0xc7e3fe85);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // Feed the four bytes to the 8-bit variant, least significant first.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, (v >> shift) & 0xff);
        return crc;
    }
}
unittest
{
    uint A = _mm_crc32_u32(0x12345678, 0x45123563);
    uint B = _mm_crc32_u32(0x76543210, 0xf50f9993);
    uint C = _mm_crc32_u32(0xDEADBEEF, 0x00170017);
    assert(A == 0x22a6ec54);
    assert(B == 0x7019a6cf);
    assert(C == 0xbc552c27);
}

/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit CRC builtin only exists when targeting x86-64.
    version(X86_64)
        enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_SSE42;
    else
        enum bool hasX86Intrin = false; // intrinsics not available in 32-bit

    static if (hasX86Intrin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // Only the low 32 bits of `crc` take part; feed the eight bytes of
        // `v` to the 8-bit variant, least significant first.
        uint running = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            running = _mm_crc32_u8(running, (v >> shift) & 0xff);
        return running;
    }
}
unittest
{
    ulong A = _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07);
    ulong B = _mm_crc32_u64(0x7654321001234567, 0xFACEFEED);
    ulong C = _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017);
    assert(A == 0xd66b1074);
    assert(B == 0xac12f9c6);
    assert(C == 0xa2d13dd8);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Table-driven step: the low byte of `crc` mixed with `v` selects the
        // table entry, which is folded into the remaining 24 bits.
        return CRC32cTable[(crc ^ v) & 0xFF] ^ (crc >> 8);
    }
}
unittest
{
    uint A = _mm_crc32_u8(0x12345678, 0x45);
    uint B = _mm_crc32_u8(0x76543210, 0xf5);
    uint C = _mm_crc32_u8(0xDEADBEEF, 0x00);
    assert(A == 0x8fd93134);
    assert(B == 0xd6b7e834);
    assert(C == 0xbdfd3980);
}


// Utilities for this file

private:

// Whether the software CRC-32C lookup table must be compiled in.
static if (GDC_with_SSE42 || LDC_with_SSE42)
{
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else static if (LDC_with_ARM64_CRC)
{
    enum bool NeedCRC32CTable = false;
}
else
{
    enum bool NeedCRC32CTable = true;
}

static if (NeedCRC32CTable)
{
    // Byte-indexed lookup table used by the software path of `_mm_crc32_u8`.
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}

// Returns the index of the first zero byte in `a`, or 16 if there is none.
int findLengthByte(__m128i a) pure @safe
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i zeroMask = _mm_cmpeq_epi8(a, zero); // 0xff where a byte is zero
    int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the zero index
    if (mask == 0)
        return 16;
    else
        return bsf(mask);
}
unittest
{
    char[16] A = "Hel!o";
    char[16] B = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthByte(mmA) == 5);
    assert(findLengthByte(mmB) == 16);
}

// Returns the index of the first zero 16-bit word in `a`, or 8 if there is none.
int findLengthShort(__m128i a) pure @safe
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i zeroMask = _mm_cmpeq_epi16(a, zero); // 0xffff where a short is zero
    int mask = _mm_movemask_epi8(zeroMask); // the lowest set bit is the byte index of the zero word
    if (mask == 0)
        return 8;
    else
        return bsf(mask) >> 1; // byte index to word index
}
unittest
{
    short[8] A = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] B = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthShort(mmA) == 3);
    assert(findLengthShort(mmB) == 8);
}

// 16 all-ones bytes followed by 16 zero bytes. A 16-byte load starting at
// offset `16 - n` yields a vector whose first `n` bytes are set.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
];

// Makes a byte validity mask with a given explicit length string.
// `len` is a number of bytes; it is clamped to [0, 16] so that the load below
// can never leave `MASK_DATA`, whatever the caller passes.
__m128i validMask8e(int len) @trusted
{
    if (len < 0) len = 0;
    if (len > 16) len = 16;
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len]);
}
unittest
{
    char[16] A = "";
    char[16] B = "0123456789abcdef";
    byte[16] correctA = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correctB = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];
    byte16 MA = cast(byte16) validMask8e(0);
    byte16 MB = cast(byte16) validMask8e(16);
    assert(MA.array == correctA);
    assert(MB.array == correctB);

    // Out-of-range lengths are clamped instead of reading outside the table.
    byte16 MC = cast(byte16) validMask8e(-1);
    byte16 MD = cast(byte16) validMask8e(17);
    assert(MC.array == correctA);
    assert(MD.array == correctB);
}

// Makes a short validity mask with a given explicit length string.
// `len` is a number of 16-bit words; it is clamped to [0, 8] so that the load
// below can never leave `MASK_DATA`, whatever the caller passes.
__m128i validMask16e(int len) @trusted
{
    if (len < 0) len = 0;
    if (len > 8) len = 8;
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16-len*2]);
}
unittest
{
    short[8] A = [3, 4, 5, 0, 3, 4, 5, 6];
    short[8] correctA = [-1, -1, -1, 0, 0, 0, 0, 0];
    short8 MA = cast(short8) validMask16e(3);
    assert(MA.array == correctA);

    // Out-of-range lengths are clamped instead of reading outside the table.
    short[8] allValid = [-1, -1, -1, -1, -1, -1, -1, -1];
    short[8] noneValid = [0, 0, 0, 0, 0, 0, 0, 0];
    short8 MB = cast(short8) validMask16e(16);
    short8 MC = cast(short8) validMask16e(-1);
    assert(MB.array == allValid);
    assert(MC.array == noneValid);
}

// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask.
// `aValid` and `bValid` are byte-mask or word-mask of the valid
// zone in `a` and `b`.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // Explicit lengths use their absolute value, saturated to 16
    // (the Intrinsics Guide doesn't tell this). The saturated values are
    // written back through `la` and `lb`.
    if (la < 0) la = -la;
    if (lb < 0) lb = -lb;
    // `-int.min` wraps back to a negative value: treat "still negative" as saturated.
    if (la < 0 || la > 16) la = 16;
    if (lb < 0 || lb > 16) lb = 16;

    static if (imm8 & 1)
    {
        // A vector only holds 8 words: clamp the word count handed to
        // `validMask16e`, whose table offset is `16 - len*2`.
        __m128i aValid = validMask16e(la > 8 ? 8 : la);
        __m128i bValid = validMask16e(lb > 8 ? 8 : lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}

// Core of the non-SSE4.2 string comparison: builds the per-character result
// mask (all-ones or zero in each byte/word) for the comparison selected by `imm8`.
// `aValid` and `bValid` are byte-masks or word-masks of the valid zone in `a` and `b`.
// imm8 bit 0 selects 16-bit characters, bit 1 selects signed comparison
// (ranges mode only), bits [3:2] select the comparison mode, and
// `_SIDD_NEGATIVE_POLARITY` / `_SIDD_MASKED_POSITIVE_POLARITY` select the
// final negation.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else // 256 comparisons
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if the upper bound of the pair in a is invalid.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // Bitwise NOT of "less than" / "greater than", as in the other branches.
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if the upper bound of the pair in a is invalid.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // A start position stays a candidate only if every character so far matched.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // A start position stays a candidate only if every character so far matched.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY)
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}