1 /**
2 * SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
4 *
5 * Copyright: Guillaume Piolat 2022.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.nmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 public import inteli.smmintrin;
13 import core.bitop: bsf, bsr;
14 
15 
16 // Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
19 // - Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
20 // - Since LDC 1.30, you need ["-mattr=+crc32"] on x86_64 if you want hardware CRC instructions,
21 //   it is not considered implied by sse4.2 anymore.
22 // With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.
23 
24 nothrow @nogc:
25 
// <Data size and signedness> -- imm8 bits [1:0]

/// Elements are unsigned 8-bit characters (default).
enum int _SIDD_UBYTE_OPS = 0x00;

/// Elements are unsigned 16-bit characters.
enum int _SIDD_UWORD_OPS = 0x01;

/// Elements are signed 8-bit characters.
enum int _SIDD_SBYTE_OPS = 0x02;

/// Elements are signed 16-bit characters.
enum int _SIDD_SWORD_OPS = 0x03;

// </Data size and signedness>


// <Comparison options> -- imm8 bits [3:2]

/// For each character in `b`, find whether it appears anywhere in `a` (default).
/// The resulting mask has a bit set at each `b` position that was found in `a`.
enum int _SIDD_CMP_EQUAL_ANY = 0x00;

/// For each character `c` in `b`, determine if
/// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...`
/// Contrary to some incorrect documentation on the Internet, the pairs must be in `a`!
enum int _SIDD_CMP_RANGES = 0x04;

/// The strings defined by `a` and `b` are compared element by element for equality.
enum int _SIDD_CMP_EQUAL_EACH = 0x08;

/// Search for the defined substring in the target.
enum int _SIDD_CMP_EQUAL_ORDERED = 0x0C;

// </Comparison options>

// <Result polarity> -- imm8 bits [5:4]

/// Do not negate results (default, no effect).
enum int _SIDD_POSITIVE_POLARITY = 0x00;

/// Negates results.
enum int _SIDD_NEGATIVE_POLARITY = 0x10;

/// No effect: results are not negated, just like `_SIDD_POSITIVE_POLARITY`.
/// You basically never want this.
enum int _SIDD_MASKED_POSITIVE_POLARITY = 0x20;

/// Negates results only before the end of the string.
enum int _SIDD_MASKED_NEGATIVE_POLARITY = 0x30;

// </Result polarity>

// <Bit returned> -- imm8 bit 6, index-returning intrinsics

/// **Index only**: return the least significant bit (default).
enum int _SIDD_LEAST_SIGNIFICANT = 0x00;

/// **Index only**: return the most significant bit.
enum int _SIDD_MOST_SIGNIFICANT = 0x40;

// </Bit returned>

// <Mask returned> -- imm8 bit 6, mask-returning intrinsics

/// **Mask only**: return the bit mask (default).
enum int _SIDD_BIT_MASK = 0x00;

/// **Mask only**: return the byte/word mask.
enum int _SIDD_UNIT_MASK = 0x40;

// </Mask returned>
94 
/// SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
///
/// Alternative explanation of imm8
///
/// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to do.
///
///    Bits [1:0]: Determine source data format.
///      00: 16 unsigned bytes
///      01: 8 unsigned words
///      10: 16 signed bytes
///      11: 8 signed words
///
///    Bits [3:2]: Determine comparison type and aggregation method.
///      00: Subset: Each character in B is compared for equality with all
///          the characters in A.
///      01: Ranges: Each character in B is compared to A pairs. The comparison
///          basis is greater than or equal for even-indexed elements in A,
///          and less than or equal for odd-indexed elements in A.
///      10: Match: Compare each pair of corresponding characters in A and
///          B for equality.
///      11: Substring: Search B for substring matches of A.
///
///    Bits [5:4]: Determine whether to do a one's complement on the bit
///                mask of the comparison results.
///      00: No effect.
///      01: Negate the bit mask.
///      10: No effect.
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of A or B.
///
126 
127 
128 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control in `imm8`. Returns 1 if `b` "does not contain a null
/// character" and the resulting mask was zero, and 0 otherwise.
/// Warning: the instruction appears to accept \0 in its input; what matters is that the
///          length is >= the element count. The purpose of this is unclear.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of elements in a full vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // The mask must be computed first, before `lb` is examined below.
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Every byte of the unit mask compares equal to zero <=> the mask is empty.
        __m128i zeroBytes = _mm_cmpeq_epi8(unitMask, _mm_setzero_si128());
        if (_mm_movemask_epi8(zeroBytes) != 0xffff)
            return 0;

        return (lb >= Count) ? 1 : 0;
    }
}
unittest
{
    char[16] reference = "Maximum\x00length!!";
    char[16] altered   = "Mbximum\x00length!!";
    __m128i vRef = _mm_loadu_si128(cast(__m128i*)reference.ptr);
    __m128i vAlt = _mm_loadu_si128(cast(__m128i*)altered.ptr);

    // String matching a-la strcmp over 16 bytes of data.
    // _SIDD_NEGATIVE_POLARITY is used since the mask must be null when everything matches.
    enum int bytesEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(_mm_cmpestra!bytesEqual(vRef, 16, vRef, 16) == 1);
    assert(_mm_cmpestra!bytesEqual(vRef, 16, vAlt, 16) == 0);

    // Negative lengths: these get clamped to 16.
    assert(_mm_cmpestra!bytesEqual(vRef, -160, vRef, -17) == 1);

    // It seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Same check with the 16-bit element format.
    enum int wordsEqual = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(_mm_cmpestra!wordsEqual(vRef, 8, vRef, 8) == 1);
}
181 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control in `imm8`. Returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Any sign bit set in the unit mask means at least one element matched.
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);
        const int anySet = _mm_movemask_epi8(unitMask);
        return (anySet != 0) ? 1 : 0;
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello world";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // The first six characters ("Hello ") are identical, so the negated
        // equality bit mask must be empty...
        __m128i mask = _mm_cmpestrm!(_SIDD_UBYTE_OPS  // match gives 0 like strcmp
                                     | _SIDD_CMP_EQUAL_EACH
                                     | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6);
        int4 maskLanes = cast(int4) mask;
        int[4] emptyMask = [0, 0, 0, 0];
        assert(maskLanes.array == emptyMask);

        // ...and consequently _mm_cmpestrc reports "mask is zero".
        assert(0 == _mm_cmpestrc!(_SIDD_UBYTE_OPS  // match gives 0 like strcmp
                                | _SIDD_CMP_EQUAL_EACH
                                | _SIDD_NEGATIVE_POLARITY)(mmA, 6, mmB, 6));

        // Including the seventh character ('w' vs 'm') yields a difference.
        assert(1 == _mm_cmpestrc!(_SIDD_UBYTE_OPS 
                                | _SIDD_CMP_EQUAL_EACH
                                | _SIDD_NEGATIVE_POLARITY)(mmA, 7, mmB, 7));
    }
}
221 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control in `imm8`, and return the generated index.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 for 16-bit elements, 16 for 8-bit elements).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        enum bool wordElements = (imm8 & 1) != 0;
        enum int Count = wordElements ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Convert the unit mask to a bit mask. In word mode the 16-bit lanes are
        // first packed down to bytes so that movemask yields one bit per element.
        static if (wordElements)
        {
            unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
        }
        const int bits = _mm_movemask_epi8(unitMask);

        // Nothing matched: the index is the element count.
        if (bits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(bits);
        else
            return bsf(bits);
    }
}
unittest
{
    // Locate the first differing character (index 6: 's' versus 'm').
    char[16] left  = "Hello sun";
    char[16] right = "Hello moon";

    __m128i vLeft  = _mm_loadu_si128(cast(__m128i*)left.ptr);
    __m128i vRight = _mm_loadu_si128(cast(__m128i*)right.ptr);

    enum int firstDiff = _SIDD_UBYTE_OPS
                       | _SIDD_CMP_EQUAL_EACH
                       | _SIDD_NEGATIVE_POLARITY
                       | _SIDD_LEAST_SIGNIFICANT;
    enum int lastDiff = _SIDD_UBYTE_OPS
                      | _SIDD_CMP_EQUAL_EACH
                      | _SIDD_NEGATIVE_POLARITY
                      | _SIDD_MOST_SIGNIFICANT;

    assert(_mm_cmpestri!firstDiff(vLeft, 9, vRight, 10) == 6);

    // Restricted to their first six characters, the strings must compare equal,
    // regardless of what follows; "no difference" is reported as index 16.
    assert(_mm_cmpestri!firstDiff(vLeft, 6, vRight, 6) == 16);
    assert(_mm_cmpestri!lastDiff(vLeft, 6, vRight, 6) == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character:
    // that is ')' at index 7 of "my_i(en)ifie".
    char[16] text = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    enum int notInRanges = _SIDD_UBYTE_OPS
                         | _SIDD_CMP_RANGES
                         | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 unitMask = cast(byte16)_mm_cmpestrm!(notInRanges | _SIDD_UNIT_MASK)(vRanges, 8, vText, 12);
    byte[16] expectedMask = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expectedMask);

    int lastIndex = _mm_cmpestri!(notInRanges | _SIDD_MOST_SIGNIFICANT)(vRanges, 8, vText, 12);
    assert(lastIndex == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES with 16-bit elements, unsigned then signed
    // (signedness only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0,  -1,  1000, 2000,    0,    0,    0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned comparison: the first pair [0, -1] reads as [0, 65535],
    // which contains every 16-bit value, so all elements match.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM);

    // Signed comparison: [0, -1] is an empty range, so only 1000 (index 6)
    // matches, through the [1000, 2000] pair.
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Substring search: look for `needle` inside two haystacks.
    char[16] needle    = "def";
    char[16] haystack  = "abcdefghdefff";
    char[16] unrelated = "no substring";
    __m128i vNeedle    = _mm_loadu_si128(cast(__m128i*)needle.ptr);
    __m128i vHaystack  = _mm_loadu_si128(cast(__m128i*)haystack.ptr);
    __m128i vUnrelated = _mm_loadu_si128(cast(__m128i*)unrelated.ptr);

    enum int substring = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED;

    // "def" starts at indices 3 and 8 of the haystack.
    byte16 hits = cast(byte16)_mm_cmpestrm!(substring | _SIDD_UNIT_MASK)(vNeedle, 3, vHaystack, 13);
    byte[16] expectedHits = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(hits.array == expectedHits);

    int first = _mm_cmpestri!substring(vNeedle, 3, vHaystack, 13);
    assert(first == 3);

    int last = _mm_cmpestri!(substring | _SIDD_MOST_SIGNIFICANT)(vNeedle, 3, vHaystack, 13);
    assert(last == 8);

    // Negative lengths are accepted too.
    first = _mm_cmpestri!substring(vNeedle, -3, vUnrelated, -12);
    assert(first == 16); // no substring found
}
363 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control in `imm8`, and return the generated mask
/// (a bit mask, or a byte/word mask when `_SIDD_UNIT_MASK` is set).
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if ((imm8 & _SIDD_UNIT_MASK) == 0)
        {
            // _SIDD_BIT_MASK: collapse the unit mask to one bit per element,
            // placed in the low 32 bits of the result.
            static if (imm8 & 1)
            {
                // Word elements: pack to bytes first so movemask gives 8 bits.
                unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
            }
            const int bits = _mm_movemask_epi8(unitMask);
            return _mm_cvtsi32_si128(bits);
        }
        else
        {
            // _SIDD_UNIT_MASK: the helper's output is returned unchanged.
            return unitMask;
        }
    }
}
unittest
{
    char[16] sentence = "Hello world!";
    char[16] wanted   = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vWanted   = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters of `wanted` were found in `sentence`:
    // 'e', 'o' and '!' sit at indices 1, 3 and 5, hence 0b101010 == 42.
    byte16 bitResult = cast(byte16)_mm_cmpestrm!(anyOf | _SIDD_BIT_MASK)(vSentence, -12, vWanted, -6);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitResult.array == expectedBits);

    // Same query, reported as a per-byte mask.
    byte16 unitResult = cast(byte16) _mm_cmpestrm!(anyOf | _SIDD_UNIT_MASK)(vSentence, 12, vWanted, 6);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitResult.array == expectedUnits);
}
415 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control in `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Read the low 32-bit lane of the mask and keep only its lowest bit.
        int4 lanes = cast(int4) cmpstrMaskExplicit!imm8(a, la, b, lb);
        const int lowLane = lanes.array[0];
        return lowLane & 1;
    }
}
unittest
{
    char[16] sentence = "Hallo world!";
    char[16] vowels   = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vVowels   = _mm_loadu_si128(cast(__m128i*)vowels.ptr);

    // Find which letters of `vowels` were found in `sentence`.
    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;
    int bit0 = _mm_cmpestro!anyOfBits(vSentence, 12, vVowels, -6);

    // 'a' (index 0 of `vowels`) occurs in "Hallo world!", so bit 0 is set.
    assert(bit0 == 1);
}
448 
/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the magnitude of the given length `la`
/// is < Count, where Count is 8 for 16-bit elements and 16 for 8-bit elements.
/// Params:
///     a = first packed string (not inspected by the fallback path)
///     la = explicit length of `a`; negative values are taken in absolute value
///     b = second packed string (not inspected by the fallback path)
///     lb = explicit length of `b` (not inspected by the fallback path)
/// Returns: 1 if `|la| < Count`, else 0.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // The length is taken in absolute value and saturated (the Intrinsics Guide
        // doesn't tell this). Testing `-Count < la < Count` is the same as
        // `|la| < Count`, but never negates `la`: `-int.min` wraps back to
        // `int.min`, which would wrongly be seen as a short length.
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (la > -Count && la < Count) ? 1 : 0;
    }
}
unittest
{
    // The vector contents are irrelevant here, only the length of `a` matters.
    __m128i zero = _mm_setzero_si128();
    enum int ops = _SIDD_UBYTE_OPS;
    assert(_mm_cmpestrs!ops(zero,  15, zero, 8) == 1);
    assert(_mm_cmpestrs!ops(zero,  16, zero, 8) == 0);
    assert(_mm_cmpestrs!ops(zero, -15, zero, 8) == 1);
    assert(_mm_cmpestrs!ops(zero, -16, zero, 8) == 0);
}
480 
/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the magnitude of the given length `lb`
/// is < Count, where Count is 8 for 16-bit elements and 16 for 8-bit elements.
/// Params:
///     a = first packed string (not inspected by the fallback path)
///     la = explicit length of `a` (not inspected by the fallback path)
///     b = second packed string (not inspected by the fallback path)
///     lb = explicit length of `b`; negative values are taken in absolute value
/// Returns: 1 if `|lb| < Count`, else 0.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        // The length is taken in absolute value and saturated (the Intrinsics Guide
        // doesn't tell this). Testing `-Count < lb < Count` is the same as
        // `|lb| < Count`, but never negates `lb`: `-int.min` wraps back to
        // `int.min`, which would wrongly be seen as a short length.
        enum int Count = (imm8 & 1) ? 8 : 16;
        return (lb > -Count && lb < Count) ? 1 : 0;
    }
}
unittest
{
    __m128i b;
    b = 0;
    // Exercise _mm_cmpestrz itself. `la` and `lb` deliberately differ so the
    // result can only come from `lb` (the length of `b`), never from `la`.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -16) == 0);
}
512 
/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero otherwise.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(lhs, rhs);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(lhs, rhs));
    }
    else
    {
        // Scalar fallback, one lane at a time; -1 has all 64 bits set.
        long2 result;
        foreach (lane; 0 .. 2)
        {
            if (lhs.array[lane] > rhs.array[lane])
                result.ptr[lane] = -1;
            else
                result.ptr[lane] = 0;
        }
        return cast(__m128i)result;
    }
}
unittest
{
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ];
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Operands whose ordering is decided across the 32-bit boundary: a
    // lane-by-lane 32-bit comparison would yield half-set lanes here, so this
    // case only passes with a genuine 64-bit comparison.
    __m128i C = _mm_setr_epi64(0x1_0000_0000L, 1L);
    __m128i D = _mm_setr_epi64(1L, 0x1_0000_0000L);
    long[2] correctWide = [ -1, 0 ];
    long2 RW = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(RW.array == correctWide);
}
545 
/// Compare the packed, implicit-length strings `a` and `b` according to the control
/// in `imm8`. Returns 1 if `b` did not contain a null character and the resulting
/// mask was zero, and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the element width,
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int lenA = measure(a);
        int lenB = measure(b);
        return _mm_cmpestra!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    char[16] one     = "Maximum\x00one";
    char[16] four    = "Maximum\x00four";
    char[16] altered = "Mbximum\x00length!";
    __m128i vOne     = _mm_loadu_si128(cast(__m128i*)one.ptr);
    __m128i vFour    = _mm_loadu_si128(cast(__m128i*)four.ptr);
    __m128i vAltered = _mm_loadu_si128(cast(__m128i*)altered.ptr);

    // String matching a-la strcmp, for 16 bytes of data.
    // Negative polarity is used since the mask must be null when everything matches.
    enum int maskedEqual = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    enum int plainEqual  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    assert(_mm_cmpistra!maskedEqual(vOne, vFour) == 0);   // match, but b is too short
    assert(_mm_cmpistra!plainEqual(vOne, vAltered) == 0); // do not match
}
593 
/// Compare the packed, implicit-length strings `a` and `b` according to the control
/// in `imm8`. Returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the element width,
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int lenA = measure(a);
        int lenB = measure(b);
        return _mm_cmpestrc!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Compare two shorter strings
    char[16] shortText = "Hello";
    char[16] longText  = "Hello moon";
    __m128i vShort = _mm_loadu_si128(cast(__m128i*)shortText.ptr);
    __m128i vLong  = _mm_loadu_si128(cast(__m128i*)longText.ptr);

    enum int differs = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    // A match gives 0, like strcmp.
    assert(_mm_cmpistrc!differs(vShort, vShort) == 0);
    assert(_mm_cmpistrc!differs(vShort, vLong) == 1);
}
637 
/// Compare the packed, implicit-length strings `a` and `b` according to the control
/// in `imm8`, and return the generated index.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 for 16-bit elements, 16 for 8-bit elements).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the element width,
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int lenA = measure(a);
        int lenB = measure(b);
        return _mm_cmpestri!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    // Identify the last character that isn't an identifier character:
    // that is ')' at index 7 of "my_i(en)ifie".
    char[16] text = "my_i(en)ifie";
    char[16] identRanges = "__azAz09";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    enum int notInRanges = _SIDD_UBYTE_OPS
                         | _SIDD_CMP_RANGES
                         | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 unitMask = cast(byte16)_mm_cmpistrm!(notInRanges | _SIDD_UNIT_MASK)(vRanges, vText);
    byte[16] expectedMask = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expectedMask);

    int lastIndex = _mm_cmpistri!(notInRanges | _SIDD_MOST_SIGNIFICANT)(vRanges, vText);
    assert(lastIndex == 7); // ')' is the last char not to be in [__azAz09]
}
688 
/// Compare the packed, implicit-length strings `a` and `b` according to the control
/// in `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the element width,
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int lenA = measure(a);
        int lenB = measure(b);
        return _mm_cmpestrm!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    char[16] sentence = "Hello world!";
    char[16] wanted   = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vWanted   = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    enum int anyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters of `wanted` were found in `sentence`:
    // 'e', 'o' and '!' sit at indices 1, 3 and 5, hence 0b101010 == 42.
    byte16 bitResult = cast(byte16)_mm_cmpistrm!(anyOf | _SIDD_BIT_MASK)(vSentence, vWanted);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitResult.array == expectedBits);

    // Same query, reported as a per-byte mask.
    byte16 unitResult = cast(byte16) _mm_cmpistrm!(anyOf | _SIDD_UNIT_MASK)(vSentence, vWanted);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitResult.array == expectedUnits);
}
736 
/// Compare the packed, implicit-length strings `a` and `b` according to the control
/// in `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Measure both strings with the helper matching the element width,
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias measure = findLengthShort;
        else
            alias measure = findLengthByte;

        int lenA = measure(a);
        int lenB = measure(b);
        return _mm_cmpestro!imm8(a, lenA, b, lenB);
    }
}
unittest
{
    char[16] sentence = "Hallo world!";
    char[16] vowels   = "aeiou!";
    char[16] absent   = "Z";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vVowels   = _mm_loadu_si128(cast(__m128i*)vowels.ptr);
    __m128i vAbsent   = _mm_loadu_si128(cast(__m128i*)absent.ptr);

    // Find which letters of the second operand were found in `sentence`.
    enum int anyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // 'a' (index 0 of `vowels`) occurs in "Hallo world!", so bit 0 is set.
    assert(_mm_cmpistro!anyOfBits(vSentence, vVowels) == 1);

    // 'Z' wasn't found in `sentence`, so bit 0 is clear.
    assert(_mm_cmpistro!anyOfBits(vSentence, vAbsent) == 0);
}
784 
/// Returns 1 if any character in `a` was null, and 0 otherwise.
/// `b` is not inspected by the fallback path.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // A measured length different from the full element count means `a`
        // holds a null element.
        static if (imm8 & 1)
        {
            return (findLengthShort(a) != 8) ? 1 : 0;
        }
        else
        {
            return (findLengthByte(a) != 16) ? 1 : 0;
        }
    }
}
unittest
{
    char[16] empty     = "";
    char[16] shortText = "hello";
    char[16] fullText  = "Maximum length!!";
    __m128i vEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i vShort = _mm_loadu_si128(cast(__m128i*)shortText.ptr);
    __m128i vFull  = _mm_loadu_si128(cast(__m128i*)fullText.ptr);

    // Strings shorter than the vector contain a null; the full one does not.
    assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(vEmpty, vEmpty) == 1);
    assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(vShort, vShort) == 1);
    assert(_mm_cmpistrs!_SIDD_UWORD_OPS(vFull, vFull) == 0);
}
822 
/// Returns 1 if any character in `b` was null, and 0 otherwise.
/// Bit 0 of `imm8` selects 16-bit characters (8 lanes) instead of
/// 8-bit characters (16 lanes). The software fallback only inspects `b`.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    else static if (LDC_with_SSE42)
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    else
    {
        // A terminator exists when the implicit length is shorter than
        // the number of lanes in the vector.
        static if (imm8 & 1)
            return findLengthShort(b) != 8;
        else
            return findLengthByte(b) != 16;
    }
}
unittest
{
    char[16] emptyStr = "";
    char[16] shortStr = "hello";
    char[16] fullStr  = "Maximum length!!"; // 16 characters, no terminator
    __m128i mmEmpty = _mm_loadu_si128(cast(__m128i*)emptyStr.ptr);
    __m128i mmShort = _mm_loadu_si128(cast(__m128i*)shortStr.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)fullStr.ptr);

    // Only the second operand decides the result.
    assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmFull, mmEmpty) == 1);
    assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmFull, mmShort) == 1);
    assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmEmpty, mmFull) == 0);
}
860 
861 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
        return __builtin_ia32_crc32hi(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32ch(crc, v);
    else
    {
        // Software path: feed the low byte first, then the high byte.
        const ubyte lowByte  = cast(ubyte)(v & 0xff);
        const ubyte highByte = cast(ubyte)(v >> 8);
        return _mm_crc32_u8(_mm_crc32_u8(crc, lowByte), highByte);
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    assert(_mm_crc32_u16(0x12345678, 0x4512) == 0x39c3f0ff);
    assert(_mm_crc32_u16(0x76543210, 0xf50f) == 0xcffbcf07);
    assert(_mm_crc32_u16(0xDEADBEEF, 0x0017) == 0xc7e3fe85);
}
895 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
        return __builtin_ia32_crc32si(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cw(crc, v);
    else
    {
        // Software path: feed the four bytes of `v`, least significant first.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, cast(ubyte)(v >> shift));
        return crc;
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    assert(_mm_crc32_u32(0x12345678, 0x45123563) == 0x22a6ec54);
    assert(_mm_crc32_u32(0x76543210, 0xf50f9993) == 0x7019a6cf);
    assert(_mm_crc32_u32(0xDEADBEEF, 0x00170017) == 0xbc552c27);
}
931 
/// Starting with the initial value in `crc`, accumulates a CRC32 
/// value for unsigned 64-bit integer `v`.
/// Only the low 32 bits of `crc` are used by the ARM and software paths.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only usable when targeting x86_64.
    version(X86_64)
        enum bool useX86Builtin = GDC_with_SSE42 || LDC_with_CRC32;
    else
        enum bool useX86Builtin = false;

    static if (useX86Builtin)
        return __builtin_ia32_crc32di(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cd(cast(uint)crc, v);
    else
    {
        // Software path: feed the eight bytes of `v`, least significant first.
        uint acc = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            acc = _mm_crc32_u8(acc, cast(ubyte)(v >> shift));
        return acc;
    }
}
unittest
{
    // Reference values; the upper 32 bits of the result are zero in each case.
    assert(_mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07) == 0xd66b1074);
    assert(_mm_crc32_u64(0x7654321001234567, 0xFACEFEED) == 0xac12f9c6);
    assert(_mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017) == 0xa2d13dd8);
}
973 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_CRC32)
        return __builtin_ia32_crc32qi(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cb(crc, v);
    else
    {
        // Table-driven update: the low byte of (crc ^ v) selects the entry,
        // which is combined with the remaining upper 24 bits of `crc`.
        const uint slot = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[slot];
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    assert(_mm_crc32_u8(0x12345678, 0x45) == 0x8fd93134);
    assert(_mm_crc32_u8(0x76543210, 0xf5) == 0xd6b7e834);
    assert(_mm_crc32_u8(0xDEADBEEF, 0x00) == 0xbdfd3980);
}
1005 
1006 
1007 // Utilities for this file
1008 
1009 private:
1010 
// Decides whether the software CRC-32C lookup table must be compiled in.
static if (GDC_with_SSE42 || LDC_with_CRC32)
{
    // x86 CRC builtins are enabled: the table is left out on x86_64 only.
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else
{
    // Without x86 builtins, the table is needed unless ARM64 CRC is enabled.
    enum bool NeedCRC32CTable = !LDC_with_ARM64_CRC;
}
1033 
static if (NeedCRC32CTable)
{
    // 256-entry lookup table indexed by a byte, used by the software
    // fallback of `_mm_crc32_u8`. Entries are written as 8 hex digits each.
    static immutable uint[256] CRC32cTable =
    [
        0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}
1072 
// Returns the index of the first zero byte in `a`, or 16 when `a`
// contains no zero byte (implicit length of an 8-bit string).
int findLengthByte(__m128i a) pure @safe
{
    // One mask bit per byte lane, set where that byte equals zero.
    const int zeroLanes = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));

    // The lowest set bit is the position of the first terminator.
    return (zeroLanes != 0) ? bsf(zeroLanes) : 16;
}
unittest
{
    char[16] terminated = "Hel!o";            // zero-padded after 5 characters
    char[16] fullWidth  = "Maximum length!!"; // 16 characters, no terminator
    __m128i mmTerminated = _mm_loadu_si128(cast(__m128i*)terminated.ptr);
    __m128i mmFullWidth  = _mm_loadu_si128(cast(__m128i*)fullWidth.ptr);
    assert(findLengthByte(mmTerminated) == 5);
    assert(findLengthByte(mmFullWidth) == 16);
}
1092 
// Returns the index of the first zero 16-bit lane in `a`, or 8 when `a`
// contains no zero lane (implicit length of a 16-bit string).
int findLengthShort(__m128i a) pure @safe
{
    // Two mask bits per 16-bit lane, both set where that lane equals zero.
    const int zeroLanes = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));

    // The lowest set bit is a byte position; halve it to get the lane index.
    return (zeroLanes != 0) ? (bsf(zeroLanes) >> 1) : 8;
}
unittest
{
    short[8] terminated = [10, 5423, 475, 0, 1, 1, 1, 1 ]; // first zero at lane 3
    short[8] fullWidth  = [-1, -2, -3, 4, 5, 6, -32768, 1]; // no zero lane
    __m128i mmTerminated = _mm_loadu_si128(cast(__m128i*)terminated.ptr);
    __m128i mmFullWidth  = _mm_loadu_si128(cast(__m128i*)fullWidth.ptr);
    assert(findLengthShort(mmTerminated) == 3);
    assert(findLengthShort(mmFullWidth) == 8);
}
1112 
// 16 bytes of all-ones followed by 16 bytes of zero. A 16-byte window read
// at offset `16 - n` yields `n` leading 0xFF bytes followed by zeroes.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
];
1120 
// Builds the byte validity mask of an explicit-length 8-bit string:
// the first `len` bytes are 0xFF, the remaining ones are zero.
// The window start is `16 - len` inside the 32-byte `MASK_DATA` table.
__m128i validMask8e(int len) @trusted
{
    const int windowStart = 16 - len;
    auto window = &MASK_DATA[windowStart];
    return _mm_loadu_si128(cast(__m128i*) window);
}
unittest
{
    // Length 0 marks nothing valid, length 16 marks every byte valid.
    byte[16] noneValid = 0;
    byte[16] allValid  = -1;
    assert((cast(byte16) validMask8e(0)).array == noneValid);
    assert((cast(byte16) validMask8e(16)).array == allValid);
}
1137 
// Builds the 16-bit lane validity mask of an explicit-length 16-bit string:
// the first `len` lanes are 0xFFFF, the remaining ones are zero.
// The window start is `16 - 2*len` inside the 32-byte `MASK_DATA` table.
__m128i validMask16e(int len) @trusted
{
    const int windowStart = 16 - len*2;
    auto window = &MASK_DATA[windowStart];
    return _mm_loadu_si128(cast(__m128i*) window);
}
unittest
{
    // A length of 3 marks exactly the first three 16-bit lanes as valid.
    short[8] expected = [-1, -1, -1, 0, 0, 0, 0, 0];
    assert((cast(short8) validMask16e(3)).array == expected);
}
1150 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit explicit-length strings, get a mask.
//
// `la` and `lb` are the explicit lengths of `a` and `b`. They are passed by
// reference and overwritten with their saturated value: the absolute value,
// clamped to the number of lanes (16 for 8-bit characters, 8 for 16-bit
// characters, selected by bit 0 of `imm8`).
// The validity masks derived from those lengths are then forwarded to
// `cmpstrMask`, whose result is returned.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // Number of characters that fit in a 128-bit vector.
    enum int maxLen = (imm8 & 1) ? 8 : 16;

    // saturates lengths (the Intrinsics Guide doesn't tell this)
    // `-int.min` overflows back to `int.min`, so that value is mapped
    // directly to the maximum length instead of being negated.
    if (la < 0) la = (la == int.min) ? maxLen : -la;
    if (lb < 0) lb = (lb == int.min) ? maxLen : -lb;

    // Clamping to the lane count (not always 16) keeps the mask lookup
    // inside its table: in 16-bit mode a length above 8 would otherwise
    // produce a negative table offset.
    if (la > maxLen) la = maxLen;
    if (lb > maxLen) lb = maxLen;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}
1178 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask with one lane per character
// (all bits set in a lane where the comparison holds).
//
// `aValid` and `bValid` are byte-masks or word-masks of the valid
// zone in `a` and `b`.
// `imm8` is decoded as follows:
//   - bit 0: 16-bit characters instead of 8-bit characters,
//   - bit 1: signed comparison (only consulted in ranges mode),
//   - bits 2-3: comparison mode (0 = equal any, 1 = ranges,
//     2 = equal each, 3 = equal ordered),
//   - `_SIDD_NEGATIVE_POLARITY` / `_SIDD_MASKED_POSITIVE_POLARITY` bits:
//     optional negation of the result.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b: is it equal to any valid character of a?
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here
                // (validity is read at the upper bound of the pair).
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }            
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // "not less than" / "not greater than": invert the strict comparisons.
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here
                // (validity is read at the upper bound of the pair).
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }  
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character `pos` of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Keep only the start positions that still match so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character `pos` of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];            
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Keep only the start positions that still match so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else 
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) 
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}