/**
* SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
*
* Copyright: Guillaume Piolat 2022.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.nmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.smmintrin;
import core.bitop: bsf, bsr;


// Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
// Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
// With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.

nothrow @nogc:

// <Data size and signedness>

/// String contains unsigned 8-bit characters (default).
enum int _SIDD_UBYTE_OPS = 0;

/// String contains unsigned 16-bit characters.
enum int _SIDD_UWORD_OPS = 1;

/// String contains signed 8-bit characters.
enum int _SIDD_SBYTE_OPS = 2;

/// String contains signed 16-bit characters.
enum int _SIDD_SWORD_OPS = 3;

// </Data size and signedness>


// <Comparison options>

/// For each character in `b`, find if it is in `a` (default)
/// The resulting mask has bit set at b positions that were found in a.
enum int _SIDD_CMP_EQUAL_ANY = 0;

/// For each character `c` in `b`, determine if
/// `a[0] <= c <= a[1] or a[2] <= c <= a[3]...`
/// The bounds are consecutive (low, high) pairs: even indices of `a` hold the
/// lower bounds, odd indices hold the upper bounds.
/// Contrarily to false documentation on the Internet, pairs must be in `a`!
enum int _SIDD_CMP_RANGES = 4;

/// The strings defined by `a` and `b` are equal
enum int _SIDD_CMP_EQUAL_EACH = 8;

/// Search for the defined substring in the target
enum int _SIDD_CMP_EQUAL_ORDERED = 12;

// </Comparison options>

// <Result polarity>

/// Do not negate results (default, no effect)
enum int _SIDD_POSITIVE_POLARITY = 0;

/// Negates results
enum int _SIDD_NEGATIVE_POLARITY = 16;

/// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`)
/// You basically never want this.
enum int _SIDD_MASKED_POSITIVE_POLARITY = 32;

/// Negates results only before the end of the string
enum int _SIDD_MASKED_NEGATIVE_POLARITY = 48;

// </Result polarity>

// <Bit returned>

/// **Index only**: return the least significant bit (default).
enum int _SIDD_LEAST_SIGNIFICANT = 0;

/// **Index only**: return the most significant bit.
enum int _SIDD_MOST_SIGNIFICANT = 64;

// </Bit returned>

// <Mask format>

/// **Mask only**: return the bit mask (default).
enum int _SIDD_BIT_MASK = 0;

/// **Mask only**: return the byte/word mask.
/// Note: shares its value (64) with `_SIDD_MOST_SIGNIFICANT`; which meaning
/// applies depends on whether an index or a mask intrinsic is called.
enum int _SIDD_UNIT_MASK = 64;

// </Mask format>

/// So SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
///
/// Alternative explanation of imm8
///
/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
/// words and the type of comparison to do.
///
/// Bits [1:0]: Determine source data format.
///   00: 16 unsigned bytes
///   01: 8 unsigned words
///   10: 16 signed bytes
///   11: 8 signed words
///
/// Bits [3:2]: Determine comparison type and aggregation method.
///   00: Subset: Each character in B is compared for equality with all
///       the characters in A.
///   01: Ranges: Each character in B is compared to A pairs.
///       The comparison basis is greater than or equal for even-indexed
///       elements in A, and less than or equal for odd-indexed elements in A.
///   10: Match: Compare each pair of corresponding characters in A and
///       B for equality.
///   11: Substring: Search B for substring matches of A.
///
/// Bits [5:4]: Determine whether to do a one's complement on the bit
///             mask of the comparison results.
///   00: No effect.
///   01: Negate the bit mask.
///   10: No effect.
///   11: Negate the bit mask only for bits with an index less than or equal
///       to the size of A or B.
///
/// Bit [6]: Selects the output form (this is the value 64 shared by
///          `_SIDD_MOST_SIGNIFICANT` and `_SIDD_UNIT_MASK`).
///   0: index intrinsics return the least significant set bit,
///      mask intrinsics return a bit mask.
///   1: index intrinsics return the most significant set bit,
///      mask intrinsics return a byte/word mask.
///



/// Compare the explicit-length packed strings `a` (length `la`) and `b` (length `lb`)
/// as directed by the control byte `imm8`.
/// Returns: 1 if `b` "does not contain a null character" and the resulting mask
///          was zero, and 0 otherwise.
/// Warning: actually it seems the instruction does accept \0 in input, just the length
///          must be >= count. It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of units held by one operand: 8 words or 16 bytes.
        enum int unitCount = (imm8 & 1) ? 8 : 16;

        // The helper must run before `lb` is examined below.
        // NOTE(review): the unittest passing negative lengths suggests the helper
        // normalizes `la`/`lb` in place - confirm against cmpstrMaskExplicit.
        const __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // One bit per byte of the mask that is zero; 0xffff <=> the whole mask is zero.
        const int zeroBytes = _mm_movemask_epi8(_mm_cmpeq_epi8(unitMask, _mm_setzero_si128()));
        const bool maskIsZero = (zeroBytes == 0xffff);
        return maskIsZero && (lb >= unitCount);
    }
}
unittest
{
    char[16] A = "Maximum\x00length!!";
    char[16] B = "Mbximum\x00length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    enum int byteEquality = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!byteEquality(mmA, 16, mmA, 16));
    assert(0 == _mm_cmpestra!byteEquality(mmA, 16, mmB, 16));

    // test negative length, this will be clamped to 16
    assert(1 == _mm_cmpestra!byteEquality(mmA, -160, mmA, -17));

    // it seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Test 16-bit format
    enum int wordEquality = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!wordEquality(mmA, 8, mmA, 8));
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Software path: the answer is 1 as soon as any byte of the unit mask has its top bit set.
        const __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        return _mm_movemask_epi8(unitMask) != 0;
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello world";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // match gives 0 like strcmp
        enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

        // The mask itself is not inspected here, the call only exercises _mm_cmpestrm.
        __m128i mask = _mm_cmpestrm!strcmpLike(mmA, 6, mmB, 6);

        // "Hello " == "Hello " over the first six characters...
        assert(0 == _mm_cmpestrc!strcmpLike(mmA, 6, mmB, 6));
        // ...but the seventh character differs ('w' vs 'm').
        assert(1 == _mm_cmpestrc!strcmpLike(mmA, 7, mmB, 7));
    }
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        enum bool wordUnits = (imm8 & 1) != 0;
        enum int unitCount = wordUnits ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Convert the unit mask to bit mask
        static if (wordUnits)
        {
            // Narrow the eight 16-bit lanes to bytes so that movemask yields one bit per word.
            unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
        }
        const int bits = _mm_movemask_epi8(unitMask);

        // An empty mask is reported as the unit count, whatever the requested bit order.
        if (bits == 0)
            return unitCount;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(bits);
        else
            return bsf(bits);
    }
}
unittest
{
    // Find the index of the first difference (at index 6)
    //                  v
    char[16] A = "Hello sun";
    char[16] B = "Hello moon";

    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_EQUAL_EACH
                            | _SIDD_NEGATIVE_POLARITY
                            | _SIDD_LEAST_SIGNIFICANT)(mmA, 9, mmB, 10);
    assert(index == 6);

    // Those string must compare equal, regardless of what happens after their length.
    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_LEAST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);

    index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                        | _SIDD_CMP_EQUAL_EACH
                        | _SIDD_NEGATIVE_POLARITY
                        | _SIDD_MOST_SIGNIFICANT)(mmA, 6, mmB, 6); // only look first six chars
    assert(index == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);
    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_MASKED_NEGATIVE_POLARITY
                                          | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0, -1, 1000, 2000, 0, 0, 0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges  = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned words: the first pair reads as [0, 0xFFFF], which contains every value.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this check was missing: correctM was declared but never compared

    // Signed words: the first pair reads as [0, -1] (empty), only 1000 falls in [1000, 2000].
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Find a substring
    char[16] A = "def";
    char[16] B = "abcdefghdefff";
    char[16] C = "no substring";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                          | _SIDD_CMP_EQUAL_ORDERED
                                          | _SIDD_UNIT_MASK)(mmA, 3, mmB, 13);
    byte[16] correctM = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                 | _SIDD_CMP_EQUAL_ORDERED)(mmA, 3, mmB, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                                | _SIDD_CMP_EQUAL_ORDERED
                                | _SIDD_MOST_SIGNIFICANT)(mmA, 3, mmB, 13);
    assert(lastMatch == 8);
    firstMatch = _mm_cmpestri!(_SIDD_UBYTE_OPS
                             | _SIDD_CMP_EQUAL_ORDERED)(mmA, -3, mmC, -12);
    assert(firstMatch == 16); // no substring found
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and return the generated mask.
/// The mask is a byte/word mask when `imm8` contains `_SIDD_UNIT_MASK`,
/// otherwise a bit mask stored in the lowest 32-bit lane.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if (imm8 & _SIDD_UNIT_MASK)
        {
            // The helper already produces the byte/word form.
            return unitMask;
        }
        else
        {
            // _SIDD_BIT_MASK
            static if (imm8 & 1)
            {
                // One byte per word, so that movemask gives one bit per word.
                unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
            }
            return _mm_cvtsi32_si128(_mm_movemask_epi8(unitMask));
        }
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                       | _SIDD_CMP_EQUAL_ANY
                                       | _SIDD_BIT_MASK)(mmA, -12, mmB, -6);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);
    byte16 M = cast(byte16) _mm_cmpestrm!(_SIDD_UBYTE_OPS
                                        | _SIDD_CMP_EQUAL_ANY
                                        | _SIDD_UNIT_MASK)(mmA, 12, mmB, 6);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings in `a` and `b` with lengths `la` and `lb` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // The lowest bit of the unit mask is set exactly when unit 0 matched.
        int4 lanes = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        return lanes.array[0] & 1;
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    int res = _mm_cmpestro!(_SIDD_UBYTE_OPS
                          | _SIDD_CMP_EQUAL_ANY
                          | _SIDD_BIT_MASK)(mmA, 12, mmB, -6);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);
}

/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `la` is < Count.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // saturates lengths (the Intrinsics Guide doesn't tell this)
        // The range test comes before the negation: `-int.min` wraps around to
        // `int.min`, which would otherwise be taken for a short string.
        if (la < -16 || la > 16)
            la = 16;
        else if (la < 0)
            la = -la;
        return (la < Count);
    }
}
unittest
{
    __m128i a = _mm_setzero_si128();
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0);

    // The most negative length saturates to a full-length string like any other large magnitude.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, int.min, a, 8) == 0);
}

/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the given length `lb` is < Count.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // saturates lengths (the Intrinsics Guide doesn't tell this)
        // Same ordering as in _mm_cmpestrs: clamp first so that `int.min` is never negated.
        if (lb < -16 || lb > 16)
            lb = 16;
        else if (lb < 0)
            lb = -lb;
        return (lb < Count);
    }
}
unittest
{
    __m128i b = _mm_setzero_si128();

    // These assertions used to call _mm_cmpestrs, leaving _mm_cmpestrz untested.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);

    // Only `lb` matters, `la` must be ignored.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 16) == 0);

    // The most negative length saturates to a full-length string.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, int.min) == 0);
}

/// Compare packed signed 64-bit integers in a and b for greater-than.
/// Each 64-bit lane of the result is all ones when `a > b` in that lane, and zero otherwise.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(la, lb));
    }
    else
    {
        // Scalar fallback, one lane at a time.
        long2 r;
        r.ptr[0] = (la.array[0] > lb.array[0]) ? 0xffffffff_ffffffff : 0;
        r.ptr[1] = (la.array[1] > lb.array[1]) ? 0xffffffff_ffffffff : 0;
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ];
    // This used to call _mm_cmpgt_epi32, which happens to give the same answer on this data.
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Operands for which a 32-bit lane comparison would give a different result.
    __m128i C = _mm_setr_epi64(0x1_0000_0000L, -1);
    __m128i D = _mm_setr_epi64(1, 0);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if `b` did not contain a null character and the resulting mask was zero,
/// and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both operands up to their first null unit, then defer to
        // the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestra!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] A = "Maximum\x00one";
    char[16] B = "Maximum\x00four";
    char[16] C = "Mbximum\x00length!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Use _SIDD_NEGATIVE_POLARITY since mask must be null, and all match must be one
    enum int maskedNegEquality = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    assert(0 == _mm_cmpistra!maskedNegEquality(mmA, mmB)); // match, but b is too short

    enum int negEquality = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(0 == _mm_cmpistra!negEquality(mmA, mmC)); // do not match
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`,
/// and returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both operands up to their first null unit, then defer to
        // the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestrc!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // match gives 0 like strcmp
        enum int strcmpLike = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
        assert(0 == _mm_cmpistrc!strcmpLike(mmA, mmA));
        assert(1 == _mm_cmpistrc!strcmpLike(mmA, mmB));
    }
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in `imm8`
/// and return the generated index.
/// Note: if the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both operands up to their first null unit, then defer to
        // the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestri!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    enum int notInRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 mask = cast(byte16)_mm_cmpistrm!(notInRanges | _SIDD_UNIT_MASK)(mmI, mmA);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpistri!(notInRanges | _SIDD_MOST_SIGNIFICANT)(mmI, mmA);
    assert(index == 7); // ')' is the last char not to be in [__azAz09]
}

/// Compare packed strings with implicit lengths in `a` and `b` using the control in
/// `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both operands up to their first null unit, then defer to
        // the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestrm!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] A = "Hello world!";
    char[16] B = "aeiou!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // Find which letters from B were found in A.
    byte16 R = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS
                                       | _SIDD_CMP_EQUAL_ANY
                                       | _SIDD_BIT_MASK)(mmA, mmB);
    // because 'e', 'o', and '!' were found
    byte[16] correctR = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correctR);
    byte16 M = cast(byte16) _mm_cmpistrm!(_SIDD_UBYTE_OPS
                                        | _SIDD_CMP_EQUAL_ANY
                                        | _SIDD_UNIT_MASK)(mmA, mmB);
    byte[16] correctM = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(M.array == correctM);
}

/// Compare packed strings with implicit lengths in `a` and `b` using
/// the control in `imm8`, and returns bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both operands up to their first null unit, then defer to
        // the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestro!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] A = "Hallo world!";
    char[16] B = "aeiou!";
    char[16] C = "Z";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);

    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // Find which letters from B were found in A.
    int res = _mm_cmpistro!anyOf(mmA, mmB);
    // because 'a' was found in "Hallo world!"
    assert(res == 1);
    res = _mm_cmpistro!anyOf(mmA, mmC);
    assert(res == 0); // because 'Z' wasn't found in A
}

/// Returns 1 if any character in `a` was null, and 0 otherwise.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `a` holds a null unit exactly when its implicit length is below the full unit count.
        static if (imm8 & 1)
            return findLengthShort(a) != 8;
        else
            return findLengthByte(a) != 16;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);
    assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(mmA, mmA) == 1); // empty string
    assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(mmB, mmB) == 1); // short string
    assert(_mm_cmpistrs!_SIDD_UWORD_OPS(mmC, mmC) == 0); // no null unit at all
}

/// Returns 1 if any character in `b` was null, and 0 otherwise.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // `b` holds a null unit exactly when its implicit length is below the full unit count.
        static if (imm8 & 1)
            return findLengthShort(b) != 8;
        else
            return findLengthByte(b) != 16;
    }
}
unittest
{
    char[16] A = "";
    char[16] B = "hello";
    char[16] C = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);
    assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA) == 1); // b is empty
    assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB) == 1); // b is short
    assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC) == 0); // b has no null unit, a is ignored
}


/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // GDC and LDC expose the same builtin name here.
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // Software path: feed the low byte first, then the high byte.
        return _mm_crc32_u8(_mm_crc32_u8(crc, v & 0xff), v >> 8);
    }
}
unittest
{
    uint A = _mm_crc32_u16(0x12345678, 0x4512);
    uint B = _mm_crc32_u16(0x76543210, 0xf50f);
    uint C = _mm_crc32_u16(0xDEADBEEF, 0x0017);
    assert(A == 0x39c3f0ff);
    assert(B == 0xcffbcf07);
    assert(C == 0xc7e3fe85);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // GDC and LDC expose the same builtin name here.
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // Software path: feed the four bytes of `v`, least significant first.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, (v >> shift) & 0xff);
        return crc;
    }
}
unittest
{
    uint A = _mm_crc32_u32(0x12345678, 0x45123563);
    uint B = _mm_crc32_u32(0x76543210, 0xf50f9993);
    uint C = _mm_crc32_u32(0xDEADBEEF, 0x00170017);
    assert(A == 0x22a6ec54);
    assert(B == 0x7019a6cf);
    assert(C == 0xbc552c27);
}

/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    version(X86_64)
        enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_SSE42;
    else
        enum bool hasX86Intrin = false; // intrinsics not available in 32-bit

    static if (hasX86Intrin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // Software path: only the low 32 bits of `crc` take part; the eight bytes
        // of `v` are fed least significant first.
        uint acc = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            acc = _mm_crc32_u8(acc, (v >> shift) & 0xff);
        return acc;
    }
}
unittest
{
    ulong A = _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07);
    ulong B = _mm_crc32_u64(0x7654321001234567, 0xFACEFEED);
    ulong C = _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017);
    assert(A == 0xd66b1074);
    assert(B == 0xac12f9c6);
    assert(C == 0xa2d13dd8);
}

/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // GDC and LDC expose the builtin under the same name.
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Table-driven fallback: the low byte of (crc ^ v) selects the entry,
        // which is combined with the remaining upper 24 bits of `crc`.
        const uint tableIndex = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[tableIndex];
    }
}
unittest
{
    assert(_mm_crc32_u8(0x12345678, 0x45) == 0x8fd93134);
    assert(_mm_crc32_u8(0x76543210, 0xf5) == 0xd6b7e834);
    assert(_mm_crc32_u8(0xDEADBEEF, 0x00) == 0xbdfd3980);
}


// Utilities for this file

private:

// Decides whether the software CRC-32C lookup table is compiled in.
// It is omitted with SSE4.2 on x86-64, and with ARM64 CRC when SSE4.2
// is not available; every other configuration gets the table.
static if (GDC_with_SSE42 || LDC_with_SSE42)
{
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else
{
    enum bool NeedCRC32CTable = !LDC_with_ARM64_CRC;
}

static if (NeedCRC32CTable)
{
    // Lookup table indexed by one byte, used by the software CRC-32C step.
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}

// Returns the index of the first zero byte in `a`, or 16 if there is none.
int findLengthByte(__m128i a) pure @safe
{
    // One mask bit per byte lane; a set bit marks a zero byte.
    int nulBits = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));
    return (nulBits != 0) ? bsf(nulBits) : 16;
}
unittest
{
    char[16] A = "Hel!o";
    char[16] B = "Maximum length!!";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthByte(mmA) == 5);
    assert(findLengthByte(mmB) == 16);
}

// Returns the index of the first zero 16-bit word in `a`, or 8 if there is none.
int findLengthShort(__m128i a) pure @safe
{
    // The byte-granular movemask yields two bits per word lane,
    // so the bit position is halved to obtain a word index.
    int nulBits = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));
    return (nulBits != 0) ? (bsf(nulBits) >> 1) : 8;
}
unittest
{
    short[8] A = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] B = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
    assert(findLengthShort(mmA) == 3);
    assert(findLengthShort(mmB) == 8);
}

// 16 bytes of all-ones followed by 16 zero bytes. A 16-byte window read at
// offset (16 - n) starts with n bytes of -1 and ends with (16 - n) zero bytes.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
];

// Makes a byte validity mask with a given explicit length string.
__m128i validMask8e(int len) @trusted
{
    immutable(byte)* windowStart = &MASK_DATA[16 - len];
    return _mm_loadu_si128(cast(__m128i*) windowStart);
}
unittest
{
    char[16] A = "";
    char[16] B = "0123456789abcdef";
    byte[16] correctA = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correctB = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];
    byte16 MA = cast(byte16) validMask8e(0);
    byte16 MB = cast(byte16) validMask8e(16);
    assert(MA.array == correctA);
    assert(MB.array == correctB);
}

// Makes a short validity mask with a given explicit length string.
__m128i validMask16e(int len) @trusted
{
    // Each 16-bit character occupies two mask bytes.
    immutable(byte)* windowStart = &MASK_DATA[16 - len * 2];
    return _mm_loadu_si128(cast(__m128i*) windowStart);
}
unittest
{
    short[8] A = [3, 4, 5, 0, 3, 4, 5, 6];
    short[8] correctA = [-1, -1, -1, 0, 0, 0, 0, 0];
    short8 MA = cast(short8) validMask16e(3);
    assert(MA.array == correctA);
}

// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask.
// `aValid` and `bValid` are byte-mask or word-mask of the valid
// zone in `a` and `b`.
// Explicit-length variant: `la` and `lb` are the lengths of `a` and `b`, counted
// in characters (bytes when bit 0 of `imm8` is clear, 16-bit words when it is set).
// Both are passed by reference and are overwritten with their saturated
// absolute value, so the caller sees the lengths that were actually used.
// Returns the comparison mask computed by `cmpstrMask!imm8`.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a,
                                     ref int la,
                                     __m128i b,
                                     ref int lb) @safe
{
    // A 128-bit vector holds 16 byte characters or 8 word characters.
    enum int maxLength = (imm8 & 1) ? 8 : 16;

    // Saturates lengths (the Intrinsics Guide doesn't tell this): the absolute
    // value is taken, then clamped to the number of characters in a vector.
    // Clamping to `maxLength` (and not always 16) keeps `validMask16e` from
    // indexing before the start of `MASK_DATA`.
    // `-int.min` wraps back to `int.min`, hence the `< 0` test after negation.
    if (la < 0) la = -la;
    if (lb < 0) lb = -lb;
    if (la < 0 || la > maxLength) la = maxLength;
    if (lb < 0 || lb > maxLength) lb = maxLength;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}

// Core comparison, shared by the explicit-length wrapper above.
// `aValid` and `bValid` have all bits set in the lanes of `a` and `b` that hold
// valid characters, and are zero in the other lanes.
// `imm8` is decoded as follows:
//   bit 0     : characters are 16-bit words instead of bytes
//   bit 1     : characters are signed (only consulted in ranges mode)
//   bits 3..2 : mode, 0 = equal any, 1 = ranges, 2 = equal each, 3 = equal ordered
//   bits 5..4 : polarity, applied to the result at the end
// Returns a byte-mask or word-mask with one lane per character position of `b`.
__m128i cmpstrMask(int imm8)(__m128i a,
                             __m128i aValid,
                             __m128i b,
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b, is it equal to any valid character of a?
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid by one character, so that after all
                // iterations every a character has faced every b character
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid by one character
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b characters never match
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is NOT(b < min); b <= max is NOT(b > max)
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if the upper bound of the pair in a is invalid.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is NOT(b < min); b <= max is NOT(b > max)
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons,
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if the upper bound of the pair in a is invalid.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere);

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // a start position survives only if every character of a matches
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // a start position survives only if every character of a matches
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY)
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}