1 /**
2 * SSE4.2 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_2
4 *
5 * Copyright: Guillaume Piolat 2022.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.nmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 public import inteli.smmintrin;
13 import core.bitop: bsf, bsr;
14 
15 
16 // Note: this header will work whether you have SSE4.2 enabled or not.
17 // With LDC, use "dflags-ldc": ["-mattr=+sse4.2"] or equivalent to actively 
18 // generate SSE4.2 instruction (they are often enabled with -O1 or greater).
19 // Additionally, you need ["-mattr=+crc"] on ARM if you want hardware CRC instructions.
20 
21 nothrow @nogc:
22 
// <Data size and signedness>

enum : int
{
    /// Characters are unsigned 8-bit values (default).
    _SIDD_UBYTE_OPS = 0,

    /// Characters are unsigned 16-bit values.
    _SIDD_UWORD_OPS = 1,

    /// Characters are signed 8-bit values.
    _SIDD_SBYTE_OPS = 2,

    /// Characters are signed 16-bit values.
    _SIDD_SWORD_OPS = 3,
}

// </Data size and signedness>


// <Comparison options>

enum : int
{
    /// For each character in `b`, find if it is in `a` (default).
    /// The resulting mask has a bit set at each `b` position that was found in `a`.
    _SIDD_CMP_EQUAL_ANY = 0,

    /// For each character `c` in `b`, determine if
    /// `a[0] <= c <= a[1] or a[2] <= c <= a[3]...`
    /// The range bounds are consecutive pairs taken from `a`
    /// (contrarily to false documentation on the Internet).
    _SIDD_CMP_RANGES = 4,

    /// The strings defined by `a` and `b` are compared position by position.
    _SIDD_CMP_EQUAL_EACH = 8,

    /// Search for the substring `a` inside the target `b`.
    _SIDD_CMP_EQUAL_ORDERED = 12,
}

// </Comparison options>

// <Result polarity>

enum : int
{
    /// Do not negate results (default, no effect).
    _SIDD_POSITIVE_POLARITY = 0,

    /// Negates results.
    _SIDD_NEGATIVE_POLARITY = 16,

    /// No effect. Do not negate results before the end of the string.
    /// (default when using `_SIDD_NEGATIVE_POLARITY`)
    /// You basically never want this.
    _SIDD_MASKED_POSITIVE_POLARITY = 32,

    /// Negates results only before the end of the string.
    _SIDD_MASKED_NEGATIVE_POLARITY = 48,
}

// </Result polarity>

// <Bit returned>

enum : int
{
    /// **Index only**: return the least significant bit (default).
    _SIDD_LEAST_SIGNIFICANT = 0,

    /// **Index only**: return the most significant bit.
    _SIDD_MOST_SIGNIFICANT = 64,
}

// </Bit returned>

enum : int
{
    /// **Mask only**: return the bit mask (default).
    _SIDD_BIT_MASK = 0,

    /// **Mask only**: return the byte/word mask.
    _SIDD_UNIT_MASK = 64,
}
91 
/// SSE4.2 has a lot of hard-to-understand instructions, so here is another explanation.
93 ///
94 /// Alternative explanation of imm8
95 ///
96 /// imm8 is an 8-bit immediate operand specifying whether the characters are bytes or
97 ///    words and the type of comparison to do.
98 ///
99 ///    Bits [1:0]: Determine source data format.
100 ///      00: 16 unsigned bytes
101 ///      01: 8 unsigned words
102 ///      10: 16 signed bytes
103 ///      11: 8 signed words
104 ///
105 ///    Bits [3:2]: Determine comparison type and aggregation method.
106 ///      00: Subset: Each character in B is compared for equality with all
107 ///          the characters in A.
108 ///      01: Ranges: Each character in B is compared to A pairs. The comparison
109 ///          basis is greater than or equal for even-indexed elements in A,
110 ///          and less than or equal for odd-indexed elements in A.
111 ///      10: Match: Compare each pair of corresponding characters in A and
112 ///          B for equality.
113 ///      11: Substring: Search B for substring matches of A.
114 ///
///    Bits [5:4]: Determine whether to do a one's complement on the bit
///                mask of the comparison results.
///      00: No effect.
///      01: Negate the bit mask.
///      10: No effect.
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of A or B.
122 ///
123 
124 
125 
/// Compare the packed strings `a` (length `la`) and `b` (length `lb`) as directed
/// by the control byte `imm8`. Returns 1 when `b` "does not contain a null character"
/// and the resulting mask is zero; returns 0 otherwise.
/// Warning: the instruction actually seems to accept \0 in its input; what matters is
///          that the length is >= the element count. It's not clear for what purpose.
int _mm_cmpestra(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestria128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestria128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of elements per vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // One bit per byte of the mask that is zero; 0xffff means the whole mask is zero.
        int zeroBytes = _mm_movemask_epi8(_mm_cmpeq_epi8(unitMask, _mm_setzero_si128()));
        if (zeroBytes != 0xffff)
            return 0;

        // NOTE(review): `lb` is deliberately tested only after the helper call, as in
        // the original statement order; this matters if the helper adjusts the lengths
        // it receives - confirm against cmpstrMaskExplicit.
        return (lb >= Count) ? 1 : 0;
    }
}
unittest
{
    // Two full 16-byte strings that differ only in their second character.
    char[16] first  = "Maximum\x00length!!";
    char[16] second = "Mbximum\x00length!!";
    __m128i vFirst  = _mm_loadu_si128(cast(__m128i*)first.ptr);
    __m128i vSecond = _mm_loadu_si128(cast(__m128i*)second.ptr);

    // strcmp-like matching over 16 bytes of data. _SIDD_NEGATIVE_POLARITY is used
    // because the mask must be null, so every per-character match must be one.
    enum int EqualBytes = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!EqualBytes(vFirst, 16, vFirst, 16));
    assert(0 == _mm_cmpestra!EqualBytes(vFirst, 16, vSecond, 16));

    // Negative lengths: these get clamped to 16.
    assert(1 == _mm_cmpestra!EqualBytes(vFirst, -160, vFirst, -17));

    // It seems shorter strings can't be compared for equality with _mm_cmpestra (!)

    // Same comparison in the 16-bit format.
    enum int EqualWords = _SIDD_SWORD_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(1 == _mm_cmpestra!EqualWords(vFirst, 8, vFirst, 8));
}
178 
/// Compare the packed strings `a` (length `la`) and `b` (length `lb`) as directed
/// by the control byte `imm8`. Returns 1 when the resulting mask is non-zero,
/// and 0 when it is zero.
int _mm_cmpestrc(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpestric128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // Any byte with its sign bit set means the mask is not empty.
        return (_mm_movemask_epi8(unitMask) == 0) ? 0 : 1;
    }
}
unittest
{
    // Compare two shorter strings: they agree on the first 6 characters ("Hello ")
    // and differ on the 7th ('w' versus 'm').
    char[16] A = "Hello world";
    char[16] B = "Hello moon";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmB = _mm_loadu_si128(cast(__m128i*)B.ptr);

    // A match gives 0, like strcmp.
    enum int StrCmp = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    // The mask behind a 0 result of _mm_cmpestrc must itself be all zeroes.
    // (This value was previously computed but never checked.)
    int4 mask = cast(int4) _mm_cmpestrm!StrCmp(mmA, 6, mmB, 6);
    int[4] noBits = [0, 0, 0, 0];
    assert(mask.array == noBits);

    assert(0 == _mm_cmpestrc!StrCmp(mmA, 6, mmB, 6));
    assert(1 == _mm_cmpestrc!StrCmp(mmA, 7, mmB, 7));
}
218 
/// Compare the packed strings `a` (length `la`) and `b` (length `lb`) as directed
/// by the control byte `imm8`, and return the generated index: the position of the
/// lowest set mask bit, or of the highest one with `_SIDD_MOST_SIGNIFICANT`.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on the element size).
int _mm_cmpestri(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestri128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Number of elements per vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // With 16-bit elements, narrow the mask to one byte per element so that
        // the movemask below yields one bit per element.
        static if (imm8 & 1)
            unitMask = _mm_packs_epi16(unitMask, _mm_setzero_si128());

        int bits = _mm_movemask_epi8(unitMask);
        if (bits == 0)
            return Count;

        static if (imm8 & _SIDD_MOST_SIGNIFICANT)
            return bsr(bits);
        else
            return bsf(bits);
    }
}
unittest
{
    // Find the index of the first difference: 's' versus 'm', at index 6.
    char[16] sun  = "Hello sun";
    char[16] moon = "Hello moon";
    __m128i vSun  = _mm_loadu_si128(cast(__m128i*)sun.ptr);
    __m128i vMoon = _mm_loadu_si128(cast(__m128i*)moon.ptr);

    enum int FirstDiff = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
                       | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT;
    enum int LastDiff  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
                       | _SIDD_NEGATIVE_POLARITY | _SIDD_MOST_SIGNIFICANT;

    assert(_mm_cmpestri!FirstDiff(vSun, 9, vMoon, 10) == 6);

    // Restricted to their first six characters the strings must compare equal,
    // regardless of what comes after that length.
    assert(_mm_cmpestri!FirstDiff(vSun, 6, vMoon, 6) == 16);
    assert(_mm_cmpestri!LastDiff(vSun, 6, vMoon, 6) == 16);
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";

    // Range pairs: '_'..'_', 'a'..'z', 'A'..'Z', '0'..'9'.
    // (The third pair used to read 'A'..'z', which also accepts "[\]^`".)
    char[16] identRanges = "__azAZ09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Unit mask of the characters of A that fall outside every range.
    byte16 mask = cast(byte16)_mm_cmpestrm!(_SIDD_UBYTE_OPS
                                            | _SIDD_CMP_RANGES
                                            | _SIDD_MASKED_NEGATIVE_POLARITY
                                            | _SIDD_UNIT_MASK)(mmI, 8, mmA, 12);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpestri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, 8, mmA, 12);
    assert(index == 7); // ')' is the last char not to be in [_a-zA-Z0-9]
}
unittest
{
    // Testing _SIDD_CMP_RANGES with signed shorts versus unsigned shorts
    // (signedness only makes a difference for _SIDD_CMP_RANGES).
    // Only the first 4 entries of `ranges` are used: pairs (0, -1) and (1000, 2000).
    short[8] ranges  = [0,  -1,  1000, 2000,    0,    0,    0, 0];
    short[8] numbers = [-32768, -1000, -1, -0, 0, 1, 1000, 32767];
    __m128i mmRanges = _mm_loadu_si128(cast(__m128i*)ranges.ptr);
    __m128i mmNumbers = _mm_loadu_si128(cast(__m128i*)numbers.ptr);

    // Unsigned: the pair (0, -1) reads as 0..65535, so every number is in range.
    short8 mask = cast(short8)_mm_cmpestrm!(_SIDD_UWORD_OPS
                                          | _SIDD_CMP_RANGES
                                          | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctM = [ -1, -1, -1, -1, -1, -1, -1, -1];
    assert(mask.array == correctM); // this check was missing

    // Signed: the pair (0, -1) is empty, so only 1000 (in 1000..2000) matches.
    mask = cast(short8)_mm_cmpestrm!(_SIDD_SWORD_OPS
                                   | _SIDD_CMP_RANGES
                                   | _SIDD_UNIT_MASK)(mmRanges, 4, mmNumbers, 8);
    short[8] correctZ = [ 0, 0, 0, 0, 0, 0, -1, 0];
    assert(mask.array == correctZ);
}
unittest
{
    // Substring search: "def" occurs in the haystack at offsets 3 and 8.
    char[16] needle   = "def";
    char[16] haystack = "abcdefghdefff";
    char[16] nothing  = "no substring";
    __m128i vNeedle   = _mm_loadu_si128(cast(__m128i*)needle.ptr);
    __m128i vHaystack = _mm_loadu_si128(cast(__m128i*)haystack.ptr);
    __m128i vNothing  = _mm_loadu_si128(cast(__m128i*)nothing.ptr);

    enum int FindFirst = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED;
    enum int FindLast  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_MOST_SIGNIFICANT;
    enum int FindUnits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_UNIT_MASK;

    byte16 hits = cast(byte16)_mm_cmpestrm!FindUnits(vNeedle, 3, vHaystack, 13);
    byte[16] expectedHits = [0, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0];
    assert(hits.array == expectedHits);

    assert(_mm_cmpestri!FindFirst(vNeedle, 3, vHaystack, 13) == 3);
    assert(_mm_cmpestri!FindLast(vNeedle, 3, vHaystack, 13) == 8);

    // Negative lengths, and a haystack without the needle: no substring found.
    assert(_mm_cmpestri!FindFirst(vNeedle, -3, vNothing, -12) == 16);
}
360 
/// Compare the packed strings `a` (length `la`) and `b` (length `lb`) as directed
/// by the control byte `imm8`, and return the generated mask: one full element per
/// character with `_SIDD_UNIT_MASK`, or one bit per character in the low bits otherwise.
__m128i _mm_cmpestrm(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpestrm128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        static if ((imm8 & _SIDD_UNIT_MASK) != 0)
        {
            // The helper's result is returned as-is for the unit mask.
            return unitMask;
        }
        else static if (imm8 & 1)
        {
            // _SIDD_BIT_MASK with 16-bit elements: narrow to bytes, then collect sign bits.
            __m128i narrowed = _mm_packs_epi16(unitMask, _mm_setzero_si128());
            return _mm_cvtsi32_si128(_mm_movemask_epi8(narrowed));
        }
        else
        {
            // _SIDD_BIT_MASK with 8-bit elements.
            return _mm_cvtsi32_si128(_mm_movemask_epi8(unitMask));
        }
    }
}
unittest
{
    char[16] text   = "Hello world!";
    char[16] wanted = "aeiou!";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vWanted = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    enum int AnyAsBits  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;
    enum int AnyAsUnits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK;

    // Which letters of the second string were found in the first one:
    // 'e', 'o' and '!' are, i.e. bits 1, 3 and 5 => 42.
    byte16 bits = cast(byte16)_mm_cmpestrm!AnyAsBits(vText, -12, vWanted, -6);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bits.array == expectedBits);

    // Same query, returned as one byte per character.
    byte16 units = cast(byte16) _mm_cmpestrm!AnyAsUnits(vText, 12, vWanted, 6);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(units.array == expectedUnits);
}
412 
/// Compare the packed strings `a` (length `la`) and `b` (length `lb`) as directed
/// by the control byte `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpestro(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestrio128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        __m128i unitMask = cmpstrMaskExplicit!imm8(a, la, b, lb);

        // The lowest bit of the first 32-bit lane belongs to the first character.
        int firstLane = (cast(int4) unitMask).array[0];
        return firstLane & 1;
    }
}
unittest
{
    char[16] text   = "Hallo world!";
    char[16] wanted = "aeiou!";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vWanted = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    // Was the first wanted letter found? Yes: 'a' appears in "Hallo world!".
    enum int AnyAsBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;
    assert(_mm_cmpestro!AnyAsBits(vText, 12, vWanted, -6) == 1);
}
445 
/// Returns 1 if "any character in a was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the absolute value of the given
/// length `la` is < Count (8 for 16-bit characters, 16 for 8-bit characters).
/// `b` and `lb` do not influence the result in the fallback path.
int _mm_cmpestrs(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestris128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // The length is |la| saturated to Count (the Intrinsics Guide doesn't tell this).
        // Test |la| < Count without negating `la`: `-la` overflows for int.min and
        // used to make that length look "short" instead of saturated.
        return (la > -Count) && (la < Count);
    }
}
unittest
{
    // Only the length of the first string matters here; its content is all zeroes.
    __m128i zero = _mm_setzero_si128();
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero,  15, zero, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero,  16, zero, 8) == 0);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, -15, zero, 8) == 1);
    assert(_mm_cmpestrs!_SIDD_UBYTE_OPS(zero, -16, zero, 8) == 0);
}
477 
/// Returns 1 if "any character in b was null", and 0 otherwise.
/// Warning: what they mean is it returns 1 if the absolute value of the given
/// length `lb` is < Count (8 for 16-bit characters, 16 for 8-bit characters).
/// `a` and `la` do not influence the result in the fallback path.
int _mm_cmpestrz(int imm8)(__m128i a, int la, __m128i b, int lb) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(ubyte16)a, la, cast(ubyte16)b, lb, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpestriz128(cast(byte16)a, la, cast(byte16)b, lb, imm8);
    }
    else
    {
        // Yes, this intrinsic is there for symmetrical reasons and probably useless.
        enum int Count = (imm8 & 1) ? 8 : 16;

        // The length is |lb| saturated to Count (the Intrinsics Guide doesn't tell this).
        // Test |lb| < Count without negating `lb`: `-lb` overflows for int.min and
        // used to make that length look "short" instead of saturated.
        return (lb > -Count) && (lb < Count);
    }
}
unittest
{
    // This test used to call _mm_cmpestrs, leaving _mm_cmpestrz untested.
    // `la` is deliberately chosen on the other side of the threshold from `lb`
    // to show that only `lb` drives the result.
    __m128i b;
    b = 0;
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 16, b, 15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, 15, b, 16) == 0);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -16, b, -15) == 1);
    assert(_mm_cmpestrz!_SIDD_UBYTE_OPS(b, -15, b, -16) == 0);
}
509 
/// Compare packed signed 64-bit integers in `a` and `b` for greater-than.
/// Each 64-bit lane of the result is all ones where `a > b`, and zero elsewhere.
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b) @trusted
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpgtq(lhs, rhs);
    }
    else version(LDC)
    {
        // LDC x86: Optimized since LDC 1.1.0 -O1
        //   arm64: Optimized since LDC 1.8.0 -O1
        // When SSE4.2 is disabled, this gives same sequence than below.
        return cast(__m128i)( greaterMask!long2(lhs, rhs));
    }
    else
    {
        // Scalar fallback: -1 has every bit set, which is the "true" lane value.
        long2 lanes;
        foreach (i; 0 .. 2)
            lanes.ptr[i] = (lhs.array[i] > rhs.array[i]) ? -1 : 0;
        return cast(__m128i)lanes;
    }
}
unittest
{
    // This test used to call _mm_cmpgt_epi32, which happens to give the same
    // answer on the first pair of inputs below.
    __m128i A = _mm_setr_epi64(-3,  2);
    __m128i B = _mm_setr_epi64(4, -2);
    long[2] correct = [ 0, -1 ];
    long2 R = cast(long2)(_mm_cmpgt_epi64(A, B));
    assert(R.array == correct);

    // Values whose ordering is decided by the upper 32 bits: a 32-bit lane-wise
    // comparison would produce half-set lanes here instead of [-1, 0].
    __m128i C = _mm_setr_epi64(0x1_0000_0000, 1);
    __m128i D = _mm_setr_epi64(1, 0x1_0000_0000);
    long[2] correct2 = [ -1, 0 ];
    long2 R2 = cast(long2)(_mm_cmpgt_epi64(C, D));
    assert(R2.array == correct2);
}
542 
/// Compare the implicit-length packed strings `a` and `b` as directed by the control
/// byte `imm8`. Returns 1 if `b` did not contain a null character and the resulting
/// mask was zero, and 0 otherwise.
int _mm_cmpistra(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistria128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistria128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Select the length helper matching the element width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestra!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] one    = "Maximum\x00one";
    char[16] four   = "Maximum\x00four";
    char[16] length = "Mbximum\x00length!";
    __m128i vOne    = _mm_loadu_si128(cast(__m128i*)one.ptr);
    __m128i vFour   = _mm_loadu_si128(cast(__m128i*)four.ptr);
    __m128i vLength = _mm_loadu_si128(cast(__m128i*)length.ptr);

    // strcmp-like matching: a negated polarity is used since the mask must be
    // null, so every per-character match must be one.
    enum int MaskedEq = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY;
    enum int PlainEq  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;

    assert(0 == _mm_cmpistra!MaskedEq(vOne, vFour));  // match, but b is too short
    assert(0 == _mm_cmpistra!PlainEq(vOne, vLength)); // do not match
}
590 
/// Compare the implicit-length packed strings `a` and `b` as directed by the control
/// byte `imm8`. Returns 1 if the resulting mask was non-zero, and 0 otherwise.
int _mm_cmpistrc(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(int) __builtin_ia32_pcmpistric128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Select the length helper matching the element width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestrc!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    // Compare two shorter strings.
    char[16] hello     = "Hello";
    char[16] helloMoon = "Hello moon";
    __m128i vHello     = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i vHelloMoon = _mm_loadu_si128(cast(__m128i*)helloMoon.ptr);

    // A match gives 0, like strcmp.
    enum int StrCmp = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY;
    assert(0 == _mm_cmpistrc!StrCmp(vHello, vHello));
    assert(1 == _mm_cmpistrc!StrCmp(vHello, vHelloMoon));
}
634 
/// Compare the implicit-length packed strings `a` and `b` as directed by the control
/// byte `imm8`, and return the generated index.
/// Note: when the mask is all zeroes, the returned index is always `Count`
/// (8 or 16 depending on the element size).
int _mm_cmpistri(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistri128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Select the length helper matching the element width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestri!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    // Identify the last character that isn't an identifier character.
    //                   v (at index 7)
    char[16] A = "my_i(en)ifie";

    // Range pairs: '_'..'_', 'a'..'z', 'A'..'Z', '0'..'9'.
    // (The third pair used to read 'A'..'z', which also accepts "[\]^`".)
    char[16] identRanges = "__azAZ09";
    __m128i mmA = _mm_loadu_si128(cast(__m128i*)A.ptr);
    __m128i mmI = _mm_loadu_si128(cast(__m128i*)identRanges.ptr);

    // Unit mask of the characters of A that fall outside every range.
    byte16 mask = cast(byte16)_mm_cmpistrm!(_SIDD_UBYTE_OPS
                                            | _SIDD_CMP_RANGES
                                            | _SIDD_MASKED_NEGATIVE_POLARITY
                                            | _SIDD_UNIT_MASK)(mmI, mmA);
    byte[16] correctM = [0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(mask.array == correctM);

    int index = _mm_cmpistri!(_SIDD_UBYTE_OPS
                            | _SIDD_CMP_RANGES
                            | _SIDD_MASKED_NEGATIVE_POLARITY
                            | _SIDD_MOST_SIGNIFICANT)(mmI, mmA);
    assert(index == 7); // ')' is the last char not to be in [_a-zA-Z0-9]
}
685 
/// Compare the implicit-length packed strings `a` and `b` as directed by the control
/// byte `imm8`, and return the generated mask.
__m128i _mm_cmpistrm(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return cast(__m128i) __builtin_ia32_pcmpistrm128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Select the length helper matching the element width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestrm!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] text   = "Hello world!";
    char[16] wanted = "aeiou!";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vWanted = _mm_loadu_si128(cast(__m128i*)wanted.ptr);

    enum int AnyAsBits  = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;
    enum int AnyAsUnits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK;

    // Which letters of the second string were found in the first one:
    // 'e', 'o' and '!' are, i.e. bits 1, 3 and 5 => 42.
    byte16 bits = cast(byte16)_mm_cmpistrm!AnyAsBits(vText, vWanted);
    byte[16] expectedBits = [42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(bits.array == expectedBits);

    // Same query, returned as one byte per character.
    byte16 units = cast(byte16) _mm_cmpistrm!AnyAsUnits(vText, vWanted);
    byte[16] expectedUnits = [0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(units.array == expectedUnits);
}
733 
/// Compare the implicit-length packed strings `a` and `b` as directed by the control
/// byte `imm8`, and return bit 0 of the resulting bit mask.
int _mm_cmpistro(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistrio128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Select the length helper matching the element width (bit 0 of imm8),
        // then defer to the explicit-length variant.
        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        return _mm_cmpestro!imm8(a, implicitLength(a), b, implicitLength(b));
    }
}
unittest
{
    char[16] text   = "Hallo world!";
    char[16] vowels = "aeiou!";
    char[16] zed    = "Z";
    __m128i vText   = _mm_loadu_si128(cast(__m128i*)text.ptr);
    __m128i vVowels = _mm_loadu_si128(cast(__m128i*)vowels.ptr);
    __m128i vZed    = _mm_loadu_si128(cast(__m128i*)zed.ptr);

    enum int AnyAsBits = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK;

    // The first letter of "aeiou!" ('a') was found in "Hallo world!".
    assert(_mm_cmpistro!AnyAsBits(vText, vVowels) == 1);

    // 'Z' wasn't found in "Hallo world!".
    assert(_mm_cmpistro!AnyAsBits(vText, vZed) == 0);
}
781 
/// Returns 1 if any character in `a` was null, and 0 otherwise.
/// `b` does not influence the result in the fallback path.
int _mm_cmpistrs(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistris128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Number of elements per vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        // Any length other than the full element count is reported as 1.
        return (implicitLength(a) != Count) ? 1 : 0;
    }
}
unittest
{
    char[16] empty = "";
    char[16] hello = "hello";
    char[16] full  = "Maximum length!!";
    __m128i vEmpty = _mm_loadu_si128(cast(__m128i*)empty.ptr);
    __m128i vHello = _mm_loadu_si128(cast(__m128i*)hello.ptr);
    __m128i vFull  = _mm_loadu_si128(cast(__m128i*)full.ptr);

    // Strings shorter than the vector contain a null; the 16-character one doesn't.
    assert(_mm_cmpistrs!_SIDD_UBYTE_OPS(vEmpty, vEmpty) == 1);
    assert(_mm_cmpistrs!_SIDD_SBYTE_OPS(vHello, vHello) == 1);
    assert(_mm_cmpistrs!_SIDD_UWORD_OPS(vFull, vFull) == 0);
}
819 
/// Returns 1 if any character in `b` was null, and 0 otherwise.
/// `a` does not influence the result in the fallback path.
int _mm_cmpistrz(int imm8)(__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(ubyte16)a, cast(ubyte16)b, imm8);
    }
    else static if (LDC_with_SSE42)
    {
        return __builtin_ia32_pcmpistriz128(cast(byte16)a, cast(byte16)b, imm8);
    }
    else
    {
        // Number of elements per vector: 8 words or 16 bytes.
        enum int Count = (imm8 & 1) ? 8 : 16;

        static if (imm8 & 1)
            alias implicitLength = findLengthShort;
        else
            alias implicitLength = findLengthByte;

        // Any length other than the full element count is reported as 1.
        return (implicitLength(b) != Count) ? 1 : 0;
    }
}
unittest
{
    // An empty string, a short string, and a string filling all 16 bytes.
    char[16] emptyStr = "";
    char[16] shortStr = "hello";
    char[16] fullStr  = "Maximum length!!";
    __m128i vEmpty = _mm_loadu_si128(cast(__m128i*)emptyStr.ptr);
    __m128i vShort = _mm_loadu_si128(cast(__m128i*)shortStr.ptr);
    __m128i vFull  = _mm_loadu_si128(cast(__m128i*)fullStr.ptr);

    // The second operand is the one checked here.
    assert(_mm_cmpistrz!_SIDD_UBYTE_OPS(vFull, vEmpty) == 1);
    assert(_mm_cmpistrz!_SIDD_SBYTE_OPS(vFull, vShort) == 1);
    assert(_mm_cmpistrz!_SIDD_UWORD_OPS(vEmpty, vFull) == 0);
}
857 
858 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 16-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u16 (uint crc, ushort v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
        return __builtin_ia32_crc32hi(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32ch(crc, v);
    else
    {
        // Byte-wise fallback: low byte first, then high byte.
        const uint afterLow = _mm_crc32_u8(crc, v & 0xff);
        return _mm_crc32_u8(afterLow, v >> 8);
    }
}
unittest
{
    // Expected values for three (crc, v) pairs.
    assert(_mm_crc32_u16(0x12345678, 0x4512) == 0x39c3f0ff);
    assert(_mm_crc32_u16(0x76543210, 0xf50f) == 0xcffbcf07);
    assert(_mm_crc32_u16(0xDEADBEEF, 0x0017) == 0xc7e3fe85);
}
892 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 32-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u32 (uint crc, uint v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
        return __builtin_ia32_crc32si(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cw(crc, v);
    else
    {
        // Byte-wise fallback, from the least significant byte to the most significant one.
        for (int shift = 0; shift < 32; shift += 8)
            crc = _mm_crc32_u8(crc, (v >> shift) & 0xff);
        return crc;
    }
}
unittest
{
    // Expected values for three (crc, v) pairs.
    assert(_mm_crc32_u32(0x12345678, 0x45123563) == 0x22a6ec54);
    assert(_mm_crc32_u32(0x76543210, 0xf50f9993) == 0x7019a6cf);
    assert(_mm_crc32_u32(0xDEADBEEF, 0x00170017) == 0xbc552c27);
}
928 
/// Starting with the initial value in `crc`, accumulates a CRC32 
/// value for unsigned 64-bit integer `v`.
/// In the software path only the low 32 bits of `crc` are used.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
ulong _mm_crc32_u64 (ulong crc, ulong v)
{
    // The 64-bit x86 builtin is only used when targeting X86_64.
    version(X86_64)
        enum bool useX86Builtin = GDC_with_SSE42 || LDC_with_SSE42;
    else
        enum bool useX86Builtin = false; // intrinsics not available in 32-bit

    static if (useX86Builtin)
        return __builtin_ia32_crc32di(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cd(cast(uint)crc, v);
    else
    {
        // Byte-wise fallback, from the least significant byte to the most significant one.
        uint acc = cast(uint)crc;
        for (int shift = 0; shift < 64; shift += 8)
            acc = _mm_crc32_u8(acc, (v >> shift) & 0xff);
        return acc;
    }
}
unittest
{
    // Expected values for three (crc, v) pairs; results fit in 32 bits.
    assert(_mm_crc32_u64(0x1234567812345678, 0x39C3F0FFCFFBCF07) == 0xd66b1074);
    assert(_mm_crc32_u64(0x7654321001234567, 0xFACEFEED) == 0xac12f9c6);
    assert(_mm_crc32_u64(0xDEADBEEFCAFEBABE, 0x0017C7E3FE850017) == 0xa2d13dd8);
}
970 
/// Starting with the initial value in `crc`, accumulates a CRC32 value 
/// for unsigned 8-bit integer `v`.
/// Warning: this is computing CRC-32C (Castagnoli), not CRC-32.
uint _mm_crc32_u8 (uint crc, ubyte v) @safe
{
    static if (GDC_with_SSE42 || LDC_with_SSE42)
        return __builtin_ia32_crc32qi(crc, v);
    else static if (LDC_with_ARM64_CRC)
        return __crc32cb(crc, v);
    else
    {
        // Table-driven step: the low byte of (crc ^ v) selects the table entry,
        // which is combined with the remaining upper 24 bits of `crc`.
        const uint tableIndex = (crc ^ v) & 0xFF;
        return (crc >> 8) ^ CRC32cTable[tableIndex];
    }
}
unittest
{
    // Expected values for three (crc, v) pairs.
    assert(_mm_crc32_u8(0x12345678, 0x45) == 0x8fd93134);
    assert(_mm_crc32_u8(0x76543210, 0xf5) == 0xd6b7e834);
    assert(_mm_crc32_u8(0xDEADBEEF, 0x00) == 0xbdfd3980);
}
1002 
1003 
1004 // Utilities for this file
1005 
1006 private:
1007 
// Decides at compile time whether the CRC-32C lookup table is emitted.
static if (GDC_with_SSE42 || LDC_with_SSE42)
{
    // With SSE4.2 builtins, only X86_64 targets drop the table.
    version(X86_64)
        enum bool NeedCRC32CTable = false;
    else
        enum bool NeedCRC32CTable = true;
}
else
{
    // Without SSE4.2, the table is needed unless ARM64 CRC instructions are used.
    enum bool NeedCRC32CTable = !LDC_with_ARM64_CRC;
}
1030 
static if (NeedCRC32CTable)
{
    // Lookup table for the software CRC-32C byte step, indexed by an 8-bit value.
    // Eight entries per row; literals are zero-padded to 8 hex digits.
    static immutable uint[256] CRC32cTable =
    [
        0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
        0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
        0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
        0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
        0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
        0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
        0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
        0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
        0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
        0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
        0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
        0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
        0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
        0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
        0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
        0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
        0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
        0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
        0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
        0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
        0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
        0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
        0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
        0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
        0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
        0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
        0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
        0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
        0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
        0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
        0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
        0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,
    ];
}
1069 
// Returns the index of the first zero byte in `a`, or 16 if `a` has no zero byte.
int findLengthByte(__m128i a) pure @safe
{
    // Bit i of `nullBits` is set when byte i of `a` is zero.
    int nullBits = _mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128()));

    // The lowest set bit gives the position of the first zero byte.
    return (nullBits != 0) ? bsf(nullBits) : 16;
}
unittest
{
    // A 5-character string, and one that fills all 16 bytes without a terminator.
    char[16] shortStr = "Hel!o";
    char[16] fullStr  = "Maximum length!!";
    __m128i vShort = _mm_loadu_si128(cast(__m128i*)shortStr.ptr);
    __m128i vFull  = _mm_loadu_si128(cast(__m128i*)fullStr.ptr);

    assert(findLengthByte(vShort) == 5);
    assert(findLengthByte(vFull) == 16);
}
1089 
// Returns the index of the first zero 16-bit word in `a`, or 8 if `a` has no zero word.
int findLengthShort(__m128i a) pure @safe
{
    // Each zero word contributes two adjacent set bits to `nullBits`.
    int nullBits = _mm_movemask_epi8(_mm_cmpeq_epi16(a, _mm_setzero_si128()));
    if (nullBits != 0)
    {
        // Lowest set bit is a byte position; halve it to get the word index.
        return bsf(nullBits) / 2;
    }
    return 8;
}
unittest
{
    // First input has a zero at word index 3; second has no zero word.
    short[8] withZero    = [10, 5423, 475, 0, 1, 1, 1, 1 ];
    short[8] withoutZero = [-1, -2, -3, 4, 5, 6, -32768, 1];
    __m128i vWithZero    = _mm_loadu_si128(cast(__m128i*)withZero.ptr);
    __m128i vWithoutZero = _mm_loadu_si128(cast(__m128i*)withoutZero.ptr);

    assert(findLengthShort(vWithZero) == 3);
    assert(findLengthShort(vWithoutZero) == 8);
}
1109 
// Source data for validity masks: 16 bytes with all bits set, followed by 16 zero bytes.
// A 16-byte window starting at offset (16 - n) begins with n set bytes.
static immutable byte[32] MASK_DATA =
[
    // bytes 0..15
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // bytes 16..31
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
];
1117 
// Makes a byte validity mask with a given explicit length string.
// The first `len` bytes of the result are 0xff, the remaining ones are zero.
// `len` is expected in 0..16; values outside that range are clamped so that
// the 16-byte load below can never leave `MASK_DATA`.
__m128i validMask8e(int len) @trusted
{
    if (len < 0) 
        len = 0;
    else if (len > 16) 
        len = 16;

    // Window starting `len` bytes before the boundary between ones and zeroes.
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16 - len]);
}
unittest
{
    // Length 0: no valid byte. Length 16: every byte valid. Length 5: only the first five.
    byte[16] correctNone = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte[16] correctAll  = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1];
    byte[16] correctFive = [-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    byte16 maskNone = cast(byte16) validMask8e(0);
    byte16 maskAll  = cast(byte16) validMask8e(16);
    byte16 maskFive = cast(byte16) validMask8e(5);
    assert(maskNone.array == correctNone);
    assert(maskAll.array == correctAll);
    assert(maskFive.array == correctFive);
}
1134 
// Makes a short validity mask with a given explicit length string.
// The first `len` 16-bit words of the result are 0xffff, the remaining ones are zero.
// `len` is expected in 0..8; values outside that range are clamped so that
// the 16-byte load below can never leave `MASK_DATA`.
__m128i validMask16e(int len) @trusted
{
    if (len < 0) 
        len = 0;
    else if (len > 8) 
        len = 8;

    // Each 16-bit character covers two bytes of mask.
    return _mm_loadu_si128(cast(__m128i*) &MASK_DATA[16 - len * 2]);
}
unittest
{
    // Length 3: only the first three words are valid. Also check both boundary lengths.
    short[8] correctThree = [-1, -1, -1, 0, 0, 0, 0, 0];
    short[8] correctNone  = [0, 0, 0, 0, 0, 0, 0, 0];
    short[8] correctAll   = [-1, -1, -1, -1, -1, -1, -1, -1];
    short8 maskThree = cast(short8) validMask16e(3);
    short8 maskNone  = cast(short8) validMask16e(0);
    short8 maskAll   = cast(short8) validMask16e(8);
    assert(maskThree.array == correctThree);
    assert(maskNone.array == correctNone);
    assert(maskAll.array == correctAll);
}
1147 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit explicit-length strings, get a mask.
// `la` and `lb` are the requested lengths of `a` and `b`, in characters.
// They are passed by reference and overwritten with the saturated length 
// actually used: the absolute value, clamped to the number of characters a 
// vector holds (16 for 8-bit characters, 8 for 16-bit characters).
// The validity masks built from those lengths are forwarded to `cmpstrMask`.
__m128i cmpstrMaskExplicit(int imm8)(__m128i a, 
                                     ref int la, 
                                     __m128i b, 
                                     ref int lb) @safe
{
    // Bit 0 of imm8 selects 16-bit characters, of which only 8 fit in a vector.
    enum int maxChars = (imm8 & 1) ? 8 : 16;

    // saturates lengths (the Intrinsics Guide doesn't tell this)
    // The magnitude is tested before negating, so that int.min (whose negation
    // overflows and stays negative) saturates like any other large length.
    if (la < -maxChars || la > maxChars) 
        la = maxChars;
    else if (la < 0) 
        la = -la;

    if (lb < -maxChars || lb > maxChars) 
        lb = maxChars;
    else if (lb < 0) 
        lb = -lb;

    static if (imm8 & 1)
    {
        __m128i aValid = validMask16e(la);
        __m128i bValid = validMask16e(lb);
    }
    else
    {
        __m128i aValid = validMask8e(la);
        __m128i bValid = validMask8e(lb);
    }
    return cmpstrMask!imm8(a, aValid, b, bValid);
}
1175 
// Internal implementation for non-SSE4.2
// Compare 8-bit or 16-bit strings, get a mask with one all-ones or all-zeroes 
// element (byte or word) per character position.
// `aValid` and `bValid` are byte-mask or word-mask of the valid
// zone in `a` and `b`.
// `imm8` is decoded as follows:
//   - bit 0: characters are 16-bit when set, 8-bit otherwise
//   - bit 1: characters are signed (only matters for range comparisons)
//   - bits 2-3: comparison mode (0 = equal any, 1 = ranges, 
//               2 = equal each, 3 = equal ordered)
//   - `_SIDD_NEGATIVE_POLARITY` / `_SIDD_MASKED_POSITIVE_POLARITY` bits: 
//     optional negation of the result
__m128i cmpstrMask(int imm8)(__m128i a, 
                             __m128i aValid, 
                             __m128i b, 
                             const __m128i bValid) @safe
{
    enum bool chars16Bits = imm8 & 1;
    enum int Mode = (imm8 >> 2) & 3;

    static if (Mode == 0) // equal any
    {
        // For each character of b, tell if it equals any valid character of a.
        // a is rotated one character at a time, so that after a full turn
        // every character of a has been compared to every character of b.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits) // 64 comparisons
        {
            for (int k = 0; k < 8; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi16(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!2(a), _mm_slli_si128!14(a));
                aValid = _mm_or_si128(_mm_srli_si128!2(aValid), _mm_slli_si128!14(aValid));
            }
        }
        else
        {
            for (int k = 0; k < 16; ++k)
            {
                __m128i equalMask = _mm_cmpeq_epi8(a, b);
                equalMask = _mm_and_si128(equalMask, aValid);
                R = _mm_or_si128(R, equalMask);

                // rotate a and aValid
                a = _mm_or_si128(_mm_srli_si128!1(a), _mm_slli_si128!15(a));
                aValid = _mm_or_si128(_mm_srli_si128!1(aValid), _mm_slli_si128!15(aValid));
            }
        }
        // invalid b part never matches
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 1) // ranges
    {
        enum bool signed = (imm8 & 2) != 0;

        // For each character in b, the returned mask says if it was found in a range-pair in `a`.
        __m128i R = _mm_setzero_si128();
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; pos += 2)
            {
                short min = (cast(short8)a).array[pos];
                short max = (cast(short8)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is !(b < min), b <= max is !(b > max)
                    __m128i ge = ~_mm_cmplt_epi16(b, _mm_set1_epi16(min));
                    __m128i le = ~_mm_cmpgt_epi16(b, _mm_set1_epi16(max));
                }
                else
                {
                    // No SSE way to do 16-bit unsigned comparisons, 
                    // but flipping the sign bit let us used signed comp
                    __m128i firstBits = _mm_set1_epi16(-32768);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi16(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi16(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi16(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi16(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here.
                // Only the validity of the upper bound of the pair is tested.
                short aValidHere = (cast(short8)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }            
        }
        else // 8-bits
        {
            for (int pos = 0; pos < 16; pos += 2)
            {
                byte min = (cast(byte16)a).array[pos];
                byte max = (cast(byte16)a).array[pos+1];
                static if (signed)
                {
                    // b >= min is !(b < min), b <= max is !(b > max)
                    __m128i ge = ~_mm_cmplt_epi8(b, _mm_set1_epi8(min));
                    __m128i le = ~_mm_cmpgt_epi8(b, _mm_set1_epi8(max));
                }
                else
                {
                    // Unsigned 8-bit comparisons are done here with signed ones,
                    // after flipping the sign bit of both operands.
                    __m128i firstBits = _mm_set1_epi8(-128);
                    __m128i reverseB = _mm_xor_si128(b, firstBits);
                    __m128i reverseMin = _mm_xor_si128(_mm_set1_epi8(min), firstBits);
                    __m128i reverseMax = _mm_xor_si128(_mm_set1_epi8(max), firstBits);
                    __m128i ge = ~_mm_cmplt_epi8(reverseB, reverseMin);
                    __m128i le = ~_mm_cmpgt_epi8(reverseB, reverseMax);
                }
                __m128i inRange = _mm_and_si128(le, ge);

                // Not considered in range if a is invalid here.
                // Only the validity of the upper bound of the pair is tested.
                byte aValidHere = (cast(byte16)aValid).array[pos+1];
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                inRange = _mm_and_si128(inRange, mmAValidHere); 

                R = _mm_or_si128(R, inRange);
            }
        }
        // invalid b part is not in range
        R = _mm_and_si128(R, bValid);
    }
    else static if (Mode == 2) // equal each, just 16 comparisons not 256
    {
        static if (chars16Bits)
        {
            __m128i R = _mm_cmpeq_epi16(a, b);
        }
        else
        {
            __m128i R = _mm_cmpeq_epi8(a, b);
        }

        // if only a or b is invalid, consider not equal
        R = _mm_andnot_si128(_mm_xor_si128(aValid, bValid), R);

        // if a and b are both invalid, consider equal
        R = _mm_or_si128(R, ~_mm_or_si128(aValid, bValid));
    }  
    else static if (Mode == 3) // equal ordered
    {
        // a is searched in b.
        // At iteration `pos`, b has been shifted by `pos` characters, so that 
        // element p of `equalMask` compares a[pos] with the original b[p + pos].
        // A position p stays set in R only if every character of a agrees.

        __m128i bValidShift = bValid;

        __m128i R = _mm_set1_epi32(-1); // all b positions possible for containing a
        static if (chars16Bits)
        {
            for (int pos = 0; pos < 8; ++pos)
            {
                // compare character k of a, where can it go in b?
                short charK = (cast(short8)a).array[pos];
                __m128i mmcharK = _mm_set1_epi16(charK);

                short aValidHere = (cast(short8)aValid).array[pos];
                __m128i mmAValidHere = _mm_set1_epi16(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi16(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a candidate position must survive every character of a
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!2(b);
                bValidShift = _mm_srli_si128!2(bValidShift);
            }
        }
        else
        {
            for (int pos = 0; pos < 16; ++pos)
            {
                // compare character k of a, where can it go in b?
                byte charK = (cast(byte16)a).array[pos];
                __m128i mmcharK = _mm_set1_epi8(charK);

                byte aValidHere = (cast(byte16)aValid).array[pos];            
                __m128i mmAValidHere = _mm_set1_epi8(aValidHere);
                __m128i mmAInvalidHere = _mm_xor_si128(mmAValidHere, _mm_set1_epi32(-1));
                __m128i equalMask = _mm_cmpeq_epi8(mmcharK, b);

                // Where A is invalid, the comparison always holds "equal"
                equalMask = _mm_or_si128(equalMask, mmAInvalidHere);

                // Where B is invalid, and A is valid, the comparison is forced to false
                equalMask = _mm_and_si128(equalMask, _mm_or_si128(bValidShift, mmAInvalidHere));

                // Accumulate: a candidate position must survive every character of a
                R = _mm_and_si128(R, equalMask);

                // drop first char of b
                b = _mm_srli_si128!1(b);
                bValidShift = _mm_srli_si128!1(bValidShift);
            }
        }
    }
    else 
        static assert(0);

    // Optionally negate result
    static if (imm8 & _SIDD_NEGATIVE_POLARITY)
    {
        static if (imm8 & _SIDD_MASKED_POSITIVE_POLARITY) 
        {
            R = _mm_xor_si128(R, bValid); // only negate valid b
        }
        else
        {
            R = _mm_xor_si128(R, _mm_set1_epi32(-1)); // negate all
        }
    }
    return R;
}