1 /**
2 * SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
4 *
5 * Copyright: Guillaume Piolat 2022.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.nmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 public import inteli.smmintrin;
13 import core.bitop: bsf, bsr;
14 
15 
16 // Note: this header will work whether you have SSE4.2 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively 
// generate SSE4.2 instructions (they are often enabled with -O1 or greater).
19 // Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
20 // With GDC, use "dflags-gdc": ["-msse4.2"] or equivalent to generate SSE4.2 instructions.
21 
22 nothrow @nogc:
23 
24 // <Data size and signedness>
25 
26 /// String contains unsigned 8-bit characters (default).
27 enum int _SIDD_UBYTE_OPS = 0;
28 
29 /// String contains unsigned 16-bit characters.
30 enum int _SIDD_UWORD_OPS = 1;
31 
32 /// String contains signed 8-bit characters.
33 enum int _SIDD_SBYTE_OPS = 2;
34 
35 /// String contains signed 16-bit characters.
36 enum int _SIDD_SWORD_OPS = 3;
37 
38 // </Data size and signedness>
39 
40 
41 // <Comparison options>
42 
43 /// For each character in `b`, find if it is in `a` (default)
44 /// The resulting mask has bit set at b positions that were found in a.
45 enum int _SIDD_CMP_EQUAL_ANY = 0;
46 
47 /// For each character in `b`, determine if
48 /// `a[0] <= c <= a[1] or a[1] <= c <= a[2]...`
49 /// Contrarily to false documentation on the Internet, pairs must be in `a`!
50 enum int _SIDD_CMP_RANGES = 4;
51 
52 /// The strings defined by `a` and `b` are equal
53 enum int _SIDD_CMP_EQUAL_EACH = 8;
54 
55 /// Search for the defined substring in the target
56 enum int _SIDD_CMP_EQUAL_ORDERED = 12;
57 
58 // </Comparison options>
59 
60 // <Result polarity>
61 
enum : int
{
    /// Results are left as they are (default, no effect).
    _SIDD_POSITIVE_POLARITY = 0,

    /// Results are negated.
    _SIDD_NEGATIVE_POLARITY = 16,

    /// No effect. Do not negate results before the end of the string. (default when using `_SIDD_NEGATIVE_POLARITY`)
    /// You basically never want this.
    _SIDD_MASKED_POSITIVE_POLARITY = 32,

    /// Results are negated only before the end of the string.
    _SIDD_MASKED_NEGATIVE_POLARITY = 48,
}
74 
75 // </Result polarity>
76 
77 // <Bit returned>
78 
enum : int
{
    /// **Index only**: the index of the least significant set bit is returned (default).
    _SIDD_LEAST_SIGNIFICANT = 0,

    /// **Index only**: the index of the most significant set bit is returned.
    _SIDD_MOST_SIGNIFICANT = 64,
}
84 
85 // </Bit returned>
86 
enum : int
{
    /// **Mask only**: the result is a bit mask, one bit per character (default).
    _SIDD_BIT_MASK = 0,

    /// **Mask only**: the result is a byte/word mask, one whole unit per character.
    _SIDD_UNIT_MASK = 64,
}
92 
/// SSE4.2 has a lot of hard-to-understand instructions. Here is another explanation.
94 ///
95 /// Alternative explanation of imm8
96 ///
97 /// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
98 ///    words and the type of comparison to do.
99 ///
100 ///    Bits [1:0]: Determine source data format.
101 ///      00: 16 unsigned bytes
102 ///      01: 8 unsigned words
103 ///      10: 16 signed bytes
104 ///      11: 8 signed words
105 ///
106 ///    Bits [3:2]: Determine comparison type and aggregation method.
107 ///      00: Subset: Each character in B is compared for equality with all
108 ///          the characters in A.
109 ///      01: Ranges: Each character in B is compared to A pairs. The comparison
110 ///          basis is greater than or equal for even-indexed elements in A,
111 ///          and less than or equal for odd-indexed elements in A.
112 ///      10: Match: Compare each pair of corresponding characters in A and
113 ///          B for equality.
114 ///      11: Substring: Search B for substring matches of A.
115 ///
///    Bits [5:4]: Determine whether to do a one's complement on the bit
///                mask of the comparison results.
///      00: No effect.
///      01: Negate the bit mask.
///      10: No effect.
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of A or B.
123 ///
124 
125 
126 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control `imm8`. Returns 1 if `b` "does not contain a null character"
/// and the resulting mask was zero; returns 0 otherwise.
/// Warning: the instruction actually seems to accept \0 in its input; what matters is
///          that the length is >= Count (16 for byte units, 8 for word units).
///          It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of characters held by one vector for this data format.
        enum int Count = (imm8 & 1) ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // One sign bit per byte of the mask that is zero: 0xffff means the whole mask is zero.
        int zeroBytes = _mm_movemask_epi8(_mm_cmpeq_epi8(unitMask, _mm_setzero_si128()));
        if (zeroBytes != 0xffff)
            return 0;

        // `lb` is read only after the mask has been computed, as before.
        return lb >= Count;
    }
}
unittest
{
    // String matching a-la strcmp over a full vector of data.
    // _SIDD_NEGATIVE_POLARITY is used because the mask must be null when every position matches.
    enum int ByteEquality = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    enum int WordEquality = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    char[16] textA = "Maximum\x00length!!";
    char[16] textB = "Mbximum\x00length!!";
    __m128i vecA = _mm_loadu_si128(cast(__m128i*)textA.ptr);
    __m128i vecB = _mm_loadu_si128(cast(__m128i*)textB.ptr);

    // Identical 16-byte strings, then strings differing in their second character.
    assert(1 == _mm_cmpestra!ByteEquality(vecA, 16, vecA, 16));
    assert(0 == _mm_cmpestra!ByteEquality(vecA, 16, vecB, 16));

    // Negative lengths: these get clamped to 16.
    assert(1 == _mm_cmpestra!ByteEquality(vecA, -160, vecA, -17));

    // It seems you can't compare shorter strings for equality using _mm_cmpestra (!)

    // Same comparison in 16-bit format.
    assert(1 == _mm_cmpestra!WordEquality(vecA, 8, vecA, 8));
}
179 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control `imm8`. Returns 1 if the resulting mask was non-zero,
/// and 0 otherwise.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Any sign bit set in the mask means at least one position was flagged.
        return _mm_movemask_epi8(unitMask) != 0;
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] A = "Hello world";
        char[16] B = "Hello moon";
        __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
        __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

        // match gives 0 like strcmp
        enum int Equality = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

        // The first six characters ("Hello ") are identical, so the generated bit mask
        // must be empty. This mask used to be computed but never checked.
        __m128i mask = _mm_cmpestrm!Equality(mmA, 6, mmB, 6);
        int4 maskLanes = cast(int4) mask;
        int[4] noBits = [0, 0, 0, 0];
        assert(maskLanes.array == noBits);

        // An empty mask means _mm_cmpestrc reports 0...
        assert(0 == _mm_cmpestrc!Equality(mmA, 6, mmB, 6));

        // ...while including the seventh character ('w' vs 'm') makes the mask non-zero.
        assert(1 == _mm_cmpestrc!Equality(mmA, 7, mmB, 7));
    }
}
219 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control `imm8`, and return the generated index.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on the character size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of characters held by one vector for this data format.
        enum int Count = (imm8 & 1) ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Turn the unit mask into a bit mask: word units are first narrowed to bytes
        // so that the byte movemask below yields one bit per character.
        static if (imm8 & 1)
        {
            unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
        }
        int bits = _mm_movemask_epi8(unitMask);

        // Nothing flagged: bsf/bsr are not called on zero, Count is returned instead.
        if (bits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(bits);
        else
            return bsf(bits);
    }
}
unittest
{
    // Find the index of the first difference (at index 6)
    //                     v
    char[16] sun  = "Hello sun";
    char[16] moon = "Hello moon";
    __m128i vSun  = _mm_loadu_si128(cast(__m128i*)sun.ptr);
    __m128i vMoon = _mm_loadu_si128(cast(__m128i*)moon.ptr);

    // Flags every position where the two strings differ.
    enum int Mismatch = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    assert(6 == _mm_cmpestri!(Mismatch | _SIDD_LEAST_SIGNIFICANT)(vSun, 9, vMoon, 10));

    // Looking only at the first six characters, the strings must compare equal,
    // regardless of what comes after that length: the index is then Count (16),
    // whichever end of the mask is asked for.
    assert(16 == _mm_cmpestri!(Mismatch | _SIDD_LEAST_SIGNIFICANT)(vSun, 6, vMoon, 6));
    assert(16 == _mm_cmpestri!(Mismatch | _SIDD_MOST_SIGNIFICANT)(vSun, 6, vMoon, 6));
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                         v (at index 7)
    char[16] text        = "my_i(en)ifie";
    char[16] identRanges = "__azAz09"; // range pairs: "__", "az", "Az", "09"
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Flags the characters of the text that fall in none of the ranges.
    enum int NotInRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 unitMask = cast(byte16)_mm_cmpestrm!(NotInRanges | _SIDD_UNIT_MASK)(vRanges, 8, vText, 12);
    byte[16] expected = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expected);

    int lastBad = _mm_cmpestri!(NotInRanges | _SIDD_MOST_SIGNIFICANT)(vRanges, 8, vText, 12);
    assert(lastBad == 7); // ')' is the last char not to be in [__azAz09]
}
unittest
{
    // testing _SIDD_CMP_RANGES but with signed shorts comparison instead (this only makes sense for _SIDD_CMP_RANGES)
    short[8] ranges  = [0,  -1,  1000, 2000,    0,    0,    0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned interpretation: the first pair is [0, 0xFFFF], which contains every
    // 16-bit value, so all eight numbers are flagged.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this expected value was declared but never checked

    // Signed interpretation: the first pair [0, -1] is empty, so only 1000 (inside
    // [1000, 2000]) is flagged.
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Find a substring
    char[16] needle   = "def";
    char[16] haystack = "abcdefghdefff";
    char[16] other    = "no substring";
    __m128i vNeedle   = _mm_loadu_si128(cast(__m128i*)needle.ptr);
    __m128i vHaystack = _mm_loadu_si128(cast(__m128i*)haystack.ptr);
    __m128i vOther    = _mm_loadu_si128(cast(__m128i*)other.ptr);

    enum int Substring = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED;

    // "def" starts at indices 3 and 8 of the haystack.
    byte16 unitMask = cast(byte16)_mm_cmpestrm!(Substring | _SIDD_UNIT_MASK)(vNeedle, 3, vHaystack, 13);
    byte[16] expected = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expected);

    int firstMatch = _mm_cmpestri!Substring(vNeedle, 3, vHaystack, 13);
    assert(firstMatch == 3);

    int lastMatch = _mm_cmpestri!(Substring | _SIDD_MOST_SIGNIFICANT)(vNeedle, 3, vHaystack, 13);
    assert(lastMatch == 8);

    // Same search with negative lengths, in a string that has no "def".
    firstMatch = _mm_cmpestri!Substring(vNeedle, -3, vOther, -12);
    assert(firstMatch == 16); // no substring found
}
361 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control `imm8`, and return the generated mask
/// (a unit mask with `_SIDD_UNIT_MASK`, a bit mask otherwise).
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        enum bool wantUnitMask = (imm8 & _SIDD_UNIT_MASK) != 0;
        enum bool wordUnits    = (imm8 & 1) != 0;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if (wantUnitMask)
        {
            // The helper's result is already the byte/word mask.
            return unitMask;
        }
        else
        {
            // _SIDD_BIT_MASK: collapse to one bit per character, stored in the low 32-bit lane.
            static if (wordUnits)
            {
                unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());
            }
            return _mm_cvtsi32_si128(_mm_movemask_epi8(unitMask));
        }
    }
}
unittest
{
    char[16] sentence = "Hello world!";
    char[16] letters  = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vLetters  = _mm_loadu_si128(cast(__m128i*)letters.ptr);

    enum int AnyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters from `letters` were found in `sentence` (negative lengths here).
    byte16 bitMask = cast(byte16)_mm_cmpestrm!(AnyOf | _SIDD_BIT_MASK)(vSentence, -12, vLetters, -6);
    // because 'e', 'o', and '!' were found: bits 1, 3 and 5 => 42
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitMask.array == expectedBits);

    // Same query, reported as a unit mask.
    byte16 unitMask = cast(byte16) _mm_cmpestrm!(AnyOf | _SIDD_UNIT_MASK)(vSentence, 12, vLetters, 6);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expectedUnits);
}
413 
/// Compare the packed strings `a` and `b`, whose explicit lengths are `la` and `lb`,
/// according to the control `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // The lowest bit of the first 32-bit lane belongs to the first character's unit.
        int4 lanes = cast(int4) unitMask;
        return lanes.array[0] & 1;
    }
}
unittest
{
    char[16] sentence = "Hallo world!";
    char[16] letters  = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vLetters  = _mm_loadu_si128(cast(__m128i*)letters.ptr);

    // Find which letters from `letters` were found in `sentence`.
    enum int AnyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;
    int firstBit = _mm_cmpestro!AnyOfBits(vSentence, 12, vLetters, -6);

    // because 'a' was found in "Hallo world!"
    assert(firstBit == 1);
}
446 
/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the absolute value of the given
/// length `la` is < Count (16 for byte operations, 8 for word operations).
/// `a`, `b` and `lb` do not take part in the result of the software fallback.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // The length is taken as |la|, saturated (the Intrinsics Guide doesn't tell this).
        // "|la| < Count" is written as a two-sided comparison rather than by negating
        // `la`: `-la` wraps around for int.min and would wrongly report a short string.
        return (la > -Count) && (la < Count);
    }
}
unittest
{
    __m128i a;
    a = 0;
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, 16, a, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -15, a, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, -16, a, 8) == 0);

    // Most negative length: must saturate like any other length below -16.
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(a, int.min, a, 8) == 0);
}
478 
/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the absolute value of the given
/// length `lb` is < Count (16 for byte operations, 8 for word operations).
/// `a`, `b` and `la` do not take part in the result of the software fallback.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // The length is taken as |lb|, saturated (the Intrinsics Guide doesn't tell this).
        // "|lb| < Count" is written as a two-sided comparison rather than by negating
        // `lb`: `-lb` wraps around for int.min and would wrongly report a short string.
        return (lb > -Count) && (lb < Count);
    }
}
unittest
{
    // These checks previously called _mm_cmpestrs, leaving _mm_cmpestrz untested.
    __m128i b;
    b = 0;
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -16) == 0);

    // Only `lb` matters: a short `la` must not change the answer.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 3, b, 16) == 0);

    // Most negative length: must saturate like any other length below -16.
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, int.min) == 0);
}
510 
/// Compare the packed signed 64-bit integers of `a` and `b` for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero elsewhere.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 la = cast(long2)a;
    long2 lb = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(la, lb);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(la, lb));
    }
    else
    {
        // Scalar fallback: -1 has every bit set, i.e. the same lane value as 0xffffffff_ffffffff.
        long2 lanes;
        foreach (i; 0 .. 2)
        {
            lanes.ptr[i] = (la.array[i] > lb.array[i]) ? -1 : 0;
        }
        return cast(__m128i)lanes;
    }
}
unittest
{
    // This test previously called _mm_cmpgt_epi32 and only passed by coincidence.
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ];
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Values chosen so that a 32-bit lane-wise comparison gives a different answer:
    // 0x1_0000_0000 > 1 as 64-bit integers, although its low half (0) is not > 1.
    __m128i C = _mm_setr_epi64(0x1_0000_0000, -1);
    __m128i D = _mm_setr_epi64(1, 0);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}
543 
/// Compare the packed, implicit-length strings `a` and `b` according to the control `imm8`.
/// Returns 1 if `b` did not contain a null character and the resulting mask was zero,
/// and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character size, then defer to the
        // explicit-length version.
        static if (imm8 & 1)
            alias findLength = findLengthShort;
        else
            alias findLength = findLengthByte;

        int la = findLength(a);
        int lb = findLength(b);
        return _mm_cmpestra!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] one    = "Maximum\x00one";
    char[16] four   = "Maximum\x00four";
    char[16] length = "Mbximum\x00length!";
    __m128i vOne    = _mm_loadu_si128(cast(__m128i*)one.ptr);
    __m128i vFour   = _mm_loadu_si128(cast(__m128i*)four.ptr);
    __m128i vLength = _mm_loadu_si128(cast(__m128i*)length.ptr);

    // string matching a-la strcmp, for 16-bytes of data
    // Negative polarity is used since the mask must be null when every position matches.
    enum int MaskedMismatch = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    enum int Mismatch       = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    assert(0 == _mm_cmpistra!MaskedMismatch(vOne, vFour)); // match, but b is too short

    assert(0 == _mm_cmpistra!Mismatch(vOne, vLength)); // do not match
}
591 
/// Compare the packed, implicit-length strings `a` and `b` according to the control `imm8`.
/// Returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character size, then defer to the
        // explicit-length version.
        static if (imm8 & 1)
            alias findLength = findLengthShort;
        else
            alias findLength = findLengthByte;

        int la = findLength(a);
        int lb = findLength(b);
        return _mm_cmpestrc!imm8(a, la, b, lb);
    }
}
unittest
{
    // Compare two shorter strings
    {
        char[16] hello     = "Hello";
        char[16] helloMoon = "Hello moon";
        __m128i vHello     = _mm_loadu_si128(cast(__m128i*)hello.ptr);
        __m128i vHelloMoon = _mm_loadu_si128(cast(__m128i*)helloMoon.ptr);

        enum int Mismatch = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

        // match gives 0 like strcmp
        assert(0 == _mm_cmpistrc!Mismatch(vHello, vHello));

        // strings of different content give 1
        assert(1 == _mm_cmpistrc!Mismatch(vHello, vHelloMoon));
    }
}
635 
/// Compare the packed, implicit-length strings `a` and `b` according to the control `imm8`,
/// and return the generated index.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on the character size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character size, then defer to the
        // explicit-length version.
        static if (imm8 & 1)
            alias findLength = findLengthShort;
        else
            alias findLength = findLengthByte;

        int la = findLength(a);
        int lb = findLength(b);
        return _mm_cmpestri!imm8(a, la, b, lb);
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                         v (at index 7)
    char[16] text        = "my_i(en)ifie";
    char[16] identRanges = "__azAz09"; // range pairs: "__", "az", "Az", "09"
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vRanges = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Flags the characters of the text that fall in none of the ranges.
    enum int NotInRanges = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY;

    byte16 unitMask = cast(byte16)_mm_cmpistrm!(NotInRanges | _SIDD_UNIT_MASK)(vRanges, vText);
    byte[16] expected = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expected);

    int lastBad = _mm_cmpistri!(NotInRanges | _SIDD_MOST_SIGNIFICANT)(vRanges, vText);
    assert(lastBad == 7); // ')' is the last char not to be in [__azAz09]
}
686 
/// Compare the packed, implicit-length strings `a` and `b` according to the control `imm8`,
/// and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character size, then defer to the
        // explicit-length version.
        static if (imm8 & 1)
            alias findLength = findLengthShort;
        else
            alias findLength = findLengthByte;

        int la = findLength(a);
        int lb = findLength(b);
        return _mm_cmpestrm!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] sentence = "Hello world!";
    char[16] letters  = "aeiou!";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vLetters  = _mm_loadu_si128(cast(__m128i*)letters.ptr);

    enum int AnyOf = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY;

    // Find which letters from `letters` were found in `sentence`.
    byte16 bitMask = cast(byte16)_mm_cmpistrm!(AnyOf | _SIDD_BIT_MASK)(vSentence, vLetters);
    // because 'e', 'o', and '!' were found: bits 1, 3 and 5 => 42
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bitMask.array == expectedBits);

    // Same query, reported as a unit mask.
    byte16 unitMask = cast(byte16) _mm_cmpistrm!(AnyOf | _SIDD_UNIT_MASK)(vSentence, vLetters);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(unitMask.array == expectedUnits);
}
734 
/// Compare the packed, implicit-length strings `a` and `b` according to the control `imm8`,
/// and return bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Pick the length finder matching the character size, then defer to the
        // explicit-length version.
        static if (imm8 & 1)
            alias findLength = findLengthShort;
        else
            alias findLength = findLengthByte;

        int la = findLength(a);
        int lb = findLength(b);
        return _mm_cmpestro!imm8(a, la, b, lb);
    }
}
unittest
{
    char[16] sentence = "Hallo world!";
    char[16] letters  = "aeiou!";
    char[16] zed      = "Z";
    __m128i vSentence = _mm_loadu_si128(cast(__m128i*)sentence.ptr);
    __m128i vLetters  = _mm_loadu_si128(cast(__m128i*)letters.ptr);
    __m128i vZed      = _mm_loadu_si128(cast(__m128i*)zed.ptr);

    // Find which characters of the second string were found in `sentence`.
    enum int AnyOfBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // because 'a' was found in "Hallo world!"
    assert(_mm_cmpistro!AnyOfBits(vSentence, vLetters) == 1);

    // because 'Z' wasn't found in "Hallo world!"
    assert(_mm_cmpistro!AnyOfBits(vSentence, vZed) == 0);
}
782 
/// Returns 1 if any character in `a` was null, and 0 otherwise.
/// `b` does not take part in the result of the software fallback.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Character count of a full vector, and the matching length finder.
        static if (imm8 & 1)
        {
            enum int Count = 8;
            alias findLength = findLengthShort;
        }
        else
        {
            enum int Count = 16;
            alias findLength = findLengthByte;
        }

        // A length other than the full count is reported as 1.
        return findLength(a) != Count;
    }
}
unittest
{
    char[16] empty = "";
    char[16] hello = "hello";
    char[16] full  = "Maximum length!!"; // sixteen characters, no terminator inside the vector
    __m128i vEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i vHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i vFull  = _mm_loadu_si128(cast(__m128i*)full.ptr);

    // Only the first operand is inspected for a null character.
    assert(1 == _mm_cmpistrs!_SIDD_UBYTE_OPS(vEmpty, vEmpty));
    assert(1 == _mm_cmpistrs!_SIDD_SBYTE_OPS(vHello, vHello));
    assert(0 == _mm_cmpistrs!_SIDD_UWORD_OPS(vFull, vFull));
}
820 
/// Returns 1 if any character in `b` was null, and 0 otherwise.
/// `a` does not take part in the result of the software fallback.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Character count of a full vector, and the matching length finder.
        static if (imm8 & 1)
        {
            enum int Count = 8;
            alias findLength = findLengthShort;
        }
        else
        {
            enum int Count = 16;
            alias findLength = findLengthByte;
        }

        // A length other than the full count is reported as 1.
        return findLength(b) != Count;
    }
}
846 unittest
847 {
848     char[16] A = "";
849     char[16] B = "hello";
850     char[16] C = "Maximum length!!";
851     __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
852     __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);
853     __m128i mmC = _mm_loadu_si128(cast(__m128i*)C.ptr);
854     assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(mmC, mmA) == 1);
855     assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(mmC, mmB) == 1);
856     assert(_mm_cmpistrz!_SIDD_UWORD_OPS(mmA, mmC) == 0);
857 }
858 
859 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // Same builtin name for both compilers.
        return __builtin_ia32_crc32hi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32ch(crc, v);
    }
    else
    {
        // Software path: feed the low byte first, then the high byte.
        const ubyte lowByte  = cast(ubyte) v;
        const ubyte highByte = cast(ubyte)(v >> 8);
        return _mm_crc32_u8(_mm_crc32_u8(crc, lowByte), highByte);
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    uint first  = _mm_crc32_u16(0x12345678, 0x4512);
    uint second = _mm_crc32_u16(0x76543210, 0xf50f);
    uint third  = _mm_crc32_u16(0xDEADBEEF, 0x0017);
    assert(first  == 0x39c3f0ff);
    assert(second == 0xcffbcf07);
    assert(third  == 0xc7e3fe85);
}
893 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // Same builtin name for both compilers.
        return __builtin_ia32_crc32si(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cw(crc, v);
    }
    else
    {
        // Software path: consume the four bytes of `v`, least significant first.
        foreach (byteIndex; 0 .. 4)
        {
            const ubyte chunk = cast(ubyte)(v >> (8 * byteIndex));
            crc = _mm_crc32_u8(crc, chunk);
        }
        return crc;
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    uint first  = _mm_crc32_u32(0x12345678, 0x45123563);
    uint second = _mm_crc32_u32(0x76543210, 0xf50f9993);
    uint third  = _mm_crc32_u32(0xDEADBEEF, 0x00170017);
    assert(first  == 0x22a6ec54);
    assert(second == 0x7019a6cf);
    assert(third  == 0xbc552c27);
}
929 
/// Starting with the initial value in `crc`, accumulates a CRC32
/// value for unsigned 64-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
/// Note: the ARM and software paths only use the low 32 bits of `crc`.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only usable when targeting x86-64.
    version(X86_64)
        enum bool hasX86Intrin = GDC_with_SSE42 || LDC_with_SSE42;
    else
        enum bool hasX86Intrin = false; // intrinsics not available in 32-bit

    static if (hasX86Intrin)
    {
        return __builtin_ia32_crc32di(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cd(cast(uint)crc, v);
    }
    else
    {
        // Software path: consume the eight bytes of `v`, least significant first.
        uint state = cast(uint)crc;
        foreach (byteIndex; 0 .. 8)
        {
            const ubyte chunk = cast(ubyte)(v >> (8 * byteIndex));
            state = _mm_crc32_u8(state, chunk);
        }
        return state;
    }
}
unittest
{
    // Reference values; every result fits in 32 bits.
    ulong first  = _mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07);
    ulong second = _mm_crc32_u64(0x7654321001234567, 0xFACEFEED);
    ulong third  = _mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017);
    assert(first  == 0xd66b1074);
    assert(second == 0xac12f9c6);
    assert(third  == 0xa2d13dd8);
}
971 
/// Starting with the initial value in `crc`, accumulates a CRC32 value
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
    {
        // Same builtin name for both compilers.
        return __builtin_ia32_crc32qi(crc, v);
    }
    else static if (LDC_with_ARM64_CRC)
    {
        return __crc32cb(crc, v);
    }
    else
    {
        // Table-driven step: the low byte of the state mixed with `v`
        // selects the table entry, the remaining state bits are shifted down.
        const uint tableIndex = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[tableIndex];
    }
}
unittest
{
    // Reference values for three (crc, v) pairs.
    uint first  = _mm_crc32_u8(0x12345678, 0x45);
    uint second = _mm_crc32_u8(0x76543210, 0xf5);
    uint third  = _mm_crc32_u8(0xDEADBEEF, 0x00);
    assert(first  == 0x8fd93134);
    assert(second == 0xd6b7e834);
    assert(third  == 0xbdfd3980);
}
1003 
1004 
1005 // Utilities for this file
1006 
1007 private:
1008 
// Decides whether the software CRC-32C lookup table must be compiled in.
// With SSE4.2 builtins the table is only kept for non-x86-64 targets;
// otherwise it is kept unless the ARM64 CRC instructions are available.
static if (GDC_with_SSE42 || LDC_with_SSE42)
{
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else
{
    enum bool NeedCRC32CTable = !LDC_with_ARM64_CRC;
}
1031 
// Lookup table for the software path of `_mm_crc32_u8`: one 32-bit entry per
// possible value of the byte index `(crc ^ v) & 0xFF`.
// Only compiled in when `NeedCRC32CTable` is true.
// NOTE(review): entry 0x80 is 0x82f63b78, which looks like the reflected
// CRC-32C (Castagnoli) polynomial -- consistent with the intrinsics' docs.
static if (NeedCRC32CTable)
{
    static immutable uint[256] CRC32cTable =
    [
        0x0, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x30e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x61c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x5125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0xc38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0xf36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0xa24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x92a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x82f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0xb21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0xe330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0xd3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x7198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x20bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}
1070 
// Returns the number of bytes of `a` located before its first zero byte,
// or 16 when `a` contains no zero byte.
int findLengthByte(__m128i a) pure @safe
{
    // One bit per byte lane, set where that byte equals zero.
    const int zeroBits = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));

    // The lowest set bit is the index of the first zero byte.
    return (zeroBits != 0) ? bsf(zeroBits) : 16;
}
unittest
{
    char[16] shortText = "Hel!o";
    char[16] fullText  = "Maximum length!!";
    __m128i mmShort = _mm_loadu_si128(cast(__m128i*)shortText.ptr);
    __m128i mmFull  = _mm_loadu_si128(cast(__m128i*)fullText.ptr);
    assert(findLengthByte(mmShort) == 5);
    assert(findLengthByte(mmFull) == 16);
}
1090 
// Returns the number of 16-bit elements of `a` located before its first
// zero element, or 8 when `a` contains no zero element.
int findLengthShort(__m128i a) pure @safe
{
    // Two bits per 16-bit lane (one per byte), set where the lane equals zero.
    const int zeroBits = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));

    // The lowest set bit is a byte index; halve it to get the element index.
    return (zeroBits != 0) ? (bsf(zeroBits) >> 1) : 8;
}
unittest
{
    short[8] withZero    = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] withoutZero = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i mmWithZero    = _mm_loadu_si128(cast(__m128i*)withZero.ptr);
    __m128i mmWithoutZero = _mm_loadu_si128(cast(__m128i*)withoutZero.ptr);
    assert(findLengthShort(mmWithZero) == 3);
    assert(findLengthShort(mmWithoutZero) == 8);
}
1110 
// Source bytes for the validity masks: 16 bytes of -1 (all bits set)
// followed by 16 bytes of 0. `validMask8e` and `validMask16e` load a 16-byte
// window from this array, so that the window starts with the desired number
// of all-ones bytes.
static immutable byte[32] MASK_DATA =
[
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
     0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,
];
1118 
// Makes a byte validity mask with a given explicit length string:
// the first `len` bytes of the result are all-ones, the other bytes are zero.
// The 16-byte window must fit in MASK_DATA, which requires `len` in [0, 16].
__m128i validMask8e(int len) @trusted
{
    // Starting `len` bytes before the ones/zeros boundary of MASK_DATA
    // yields `len` leading all-ones bytes.
    const int windowStart = 16 - len;
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[windowStart]);
}
unittest
{
    char[16] emptyText = "";
    char[16] fullText  = "0123456789abcdef";
    byte[16] noneValid = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] allValid  = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];

    // Both extreme lengths.
    byte16 maskEmpty = cast(byte16) validMask8e(0);
    byte16 maskFull  = cast(byte16) validMask8e(16);
    assert(maskEmpty.array == noneValid);
    assert(maskFull.array == allValid);
}
1135 
// Makes a short validity mask with a given explicit length string:
// the first `len` 16-bit elements of the result are all-ones, the others are zero.
// The 16-byte window must fit in MASK_DATA, which requires `len` in [0, 8].
__m128i validMask16e(int len) @trusted
{
    // Each 16-bit element covers two bytes of MASK_DATA.
    const int windowStart = 16 - len * 2;
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[windowStart]);
}
unittest
{
    short[8] text     = [3, 4, 5, 0, 3, 4, 5, 6];
    short[8] expected = [-1, -1, -1, 0, 0, 0, 0, 0];
    short8 mask = cast(short8) validMask16e(3);
    assert(mask.array == expected);
}
1148 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit explicit-length strings, get a mask.
// `la` and `lb` are the lengths of `a` and `b`, counted in characters.
// They are passed by reference and replaced by their saturated absolute
// value: at most 16 for 8-bit characters, at most 8 for 16-bit characters
// (bit 0 of `imm8` selects the character width).
// The validity masks built from those lengths are the byte-masks or
// word-masks of the valid zone in `a` and `b` given to `cmpstrMask`.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // A vector holds 16 byte characters, but only 8 word characters.
    // Saturating word lengths at 16 would make `validMask16e` read before
    // the start of its mask table.
    enum int maxLength = (imm8 & 1) ? 8 : 16;

    // Saturates the absolute value of the lengths (the Intrinsics Guide
    // doesn't tell this). The range is tested before negating, so that
    // int.min, whose negation overflows, saturates too.
    if (la < -maxLength || la > maxLength)
        la = maxLength;
    else if (la < 0)
        la = -la;

    if (lb < -maxLength || lb > maxLength)
        lb = maxLength;
    else if (lb < 0)
        lb = -lb;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}
1176 
// Internal implementation for non-SSE4.2
// Compares the 8-bit or 16-bit strings `a` and `b` and returns the result as
// a vector mask: one byte lane (or word lane) per character, either all-ones
// or zero.
// `aValid` and `bValid` are byte-masks or word-masks of the valid zone in
// `a` and `b`.
// `imm8` is decoded as follows:
//   - bit 0     : character width, 0 for 8-bit and 1 for 16-bit
//   - bit 1     : signed comparisons, only consulted by the ranges mode
//   - bits 2..3 : 0 = equal any, 1 = ranges, 2 = equal each, 3 = equal ordered
//   - `_SIDD_NEGATIVE_POLARITY` / `_SIDD_MASKED_POSITIVE_POLARITY` flags:
//     optional negation of the result, see the end of the function.
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b, tell if it equals any valid character of a.
        // a is rotated one character at a time, so that after a full turn each
        // character of b has been compared against every character of a.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here
                // (the upper bound of the pair must be valid).
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }            
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // "not less than" and "not greater than", same as the 16-bit case
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // No SSE way to do 8-bit unsigned comparisons, 
                    // but flipping the sign bit lets us use signed comparisons
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here
                // (the upper bound of the pair must be valid).
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }  
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.
        // At each step, character `pos` of a is compared with b shifted down
        // by `pos` characters; a b position stays set in R only if every
        // character of a matched from that position.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a position survives only if all characters matched so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];            
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a position survives only if all characters matched so far.
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else 
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) 
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}