1 /**
2 * SSSE3 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3
4 *
5 * Copyright: Guillaume Piolat 2021.
6 *            Johan Engelen 2021.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.tmmintrin;
10 
11 public import inteli.types;
12 import inteli.internals;
13 
14 public import inteli.pmmintrin;
15 import inteli.mmx;
16 
17 nothrow @nogc:
18 
19 
20 // SSSE3 instructions
21 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
22 // Note: this header will work whether you have SSSE3 enabled or not.
23 // With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively 
// generate SSSE3 instructions.
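//
// A minimal dub.json fragment (a sketch, assuming a dub-based build):
//
//     {
//         "dflags-ldc": ["-mattr=+ssse3"]
//     }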
25 
26 /// Compute the absolute value of packed signed 16-bit integers in `a`.
27 __m128i _mm_abs_epi16 (__m128i a) @trusted
28 {
29     static if (DMD_with_DSIMD)
30     {
31         return cast(__m128i)__simd(XMM.PABSW, a);
32     }
33     else static if (GDC_with_SSSE3)
34     {
35         return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
36     }
37     else static if (LDC_with_ARM64)
38     {
39         return cast(__m128i) vabsq_s16(cast(short8)a);
40     }
41     else
42     {
        // LDC x86: generates pabsw since LDC 1.1 -O2
44         short8 sa = cast(short8)a;
45         for (int i = 0; i < 8; ++i)
46         {
47             short s = sa.array[i];
48             sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
49         }  
50         return cast(__m128i)sa;
51     }
52 }
53 unittest
54 {
55     __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
56     short8 B = cast(short8) _mm_abs_epi16(A);
57     short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
58     assert(B.array == correct);
59 }
60 
61 /// Compute the absolute value of packed signed 32-bit integers in `a`.
62 __m128i _mm_abs_epi32 (__m128i a) @trusted
63 {
64     static if (DMD_with_DSIMD)
65     {
66         return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
67     }
68     else static if (GDC_with_SSSE3)
69     {
70         return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
71     }
72     else static if (LDC_with_ARM64)
73     {
74         return cast(__m128i) vabsq_s32(cast(int4)a);
75     }
76     else
77     {
78         // LDC x86: generates pabsd since LDC 1.1 -O2
79         int4 sa = cast(int4)a;
80         for (int i = 0; i < 4; ++i)
81         {
82             int s = sa.array[i];
83             sa.ptr[i] = s >= 0 ? s : -s;
84         }  
85         return cast(__m128i)sa;
86     } 
87 }
88 unittest
89 {
90     __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
91     int4 B = cast(int4) _mm_abs_epi32(A);
92     int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
93     assert(B.array == correct);
94 }
95 
96 /// Compute the absolute value of packed signed 8-bit integers in `a`.
97 __m128i _mm_abs_epi8 (__m128i a) @trusted
98 {
99     static if (DMD_with_DSIMD)
100     {
101         return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
102     }
103     else static if (GDC_with_SSSE3)
104     {
105         alias ubyte16 = __vector(ubyte[16]);
106         return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
107     }
108     else static if (LDC_with_ARM64)
109     {
110         return cast(__m128i) vabsq_s8(cast(byte16)a);
111     }
112     else version(LDC)
113     {
114         // LDC x86: generates pabsb since LDC 1.1 -O1
115         //     arm64: generates abs since LDC 1.8 -O1
116         enum ir = `
117                 %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
118                 %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
119                 %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
120                 ret <16 x i8> %r`;
121         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
122     }
123     else
124     {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This fallback doesn't generate pabsb in LDC though.
127         return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
128     }
129 }
130 unittest
131 {
132     __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
133     byte16 B = cast(byte16) _mm_abs_epi8(A);
134     byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
135     assert(B.array == correct);
136 }
137 
138 /// Compute the absolute value of packed 64-bit floating-point elements in `a`.
139 /// #BONUS.
140 __m128d _mm_abs_pd (__m128d a) @trusted
141 {
142     long2 mask = 0x7fff_ffff_ffff_ffff;
143     return cast(__m128d)((cast(long2)a) & mask);
144 }
145 unittest
146 {
    __m128d A = _mm_setr_pd(-42.0, -double.infinity);
    __m128d R = _mm_abs_pd(A);
    double[2] correct =    [42.0, +double.infinity];
150     assert(R.array == correct);
151 }
152 
153 /// Compute the absolute value of packed signed 16-bit integers in `a`.
154 __m64 _mm_abs_pi16 (__m64 a) @trusted
155 {
156     return to_m64(_mm_abs_epi16(to_m128i(a)));
157 }
158 unittest
159 {
160     __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
161     short4 B = cast(short4) _mm_abs_pi16(A);
162     short[4] correct = [0, 1, -32768, 32767];
163     assert(B.array == correct);
164 }
165 
166 /// Compute the absolute value of packed signed 32-bit integers in `a`.
167 __m64 _mm_abs_pi32 (__m64 a) @trusted
168 {
169      return to_m64(_mm_abs_epi32(to_m128i(a)));
170 }
171 unittest
172 {
173     __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
174     int2 B = cast(int2) _mm_abs_pi32(A);
175     int[2] correct = [1, -2_147_483_648];
176     assert(B.array == correct);
177 }
178 
179 /// Compute the absolute value of packed signed 8-bit integers in `a`.
180 __m64 _mm_abs_pi8 (__m64 a) @trusted
181 {
182     return to_m64(_mm_abs_epi8(to_m128i(a)));
183 }
184 unittest
185 {
186     __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
187     byte8 B = cast(byte8) _mm_abs_pi8(A);
188     byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
189     assert(B.array == correct);
190 }
191 
192 /// Compute the absolute value of packed 32-bit floating-point elements in `a`.
193 /// #BONUS.
194 __m128 _mm_abs_ps (__m128 a) @trusted
195 {
196     __m128i mask = 0x7fffffff;
197     return cast(__m128)((cast(__m128i)a) & mask);
198 }
199 unittest
200 {
201     __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity);
202     __m128 R = _mm_abs_ps(A);
203     float[4] correct =    [0.0f, 10.0f, 42.0f, +float.infinity];
204     assert(R.array == correct);
205 }
206 
207 /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
208 __m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
209 {
210     // PERF DMD
211     static if (GDC_with_SSSE3)
212     {
213         return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
214     }
215     else version(LDC)
216     {
217         static if (count >= 32)
218         {
219             return _mm_setzero_si128();
220         }
221         else static if (count < 16)
222         {
223             // Generates palignr since LDC 1.1 -O1
224             // Also generates a single ext instruction on arm64.
225             return cast(__m128i) shufflevector!(byte16, ( 0 + count),
226                                                         ( 1 + count),
227                                                         ( 2 + count),
228                                                         ( 3 + count),
229                                                         ( 4 + count),
230                                                         ( 5 + count),
231                                                         ( 6 + count),
232                                                         ( 7 + count),
233                                                         ( 8 + count),
234                                                         ( 9 + count),
235                                                         (10 + count),
236                                                         (11 + count),
237                                                         (12 + count),
238                                                         (13 + count),
239                                                         (14 + count),
240                                                         (15 + count))(cast(byte16)b, cast(byte16)a);
241         }
242         else
243         {
244             return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
245                                                         ( 1 + count) % 32,
246                                                         ( 2 + count) % 32,
247                                                         ( 3 + count) % 32,
248                                                         ( 4 + count) % 32,
249                                                         ( 5 + count) % 32,
250                                                         ( 6 + count) % 32,
251                                                         ( 7 + count) % 32,
252                                                         ( 8 + count) % 32,
253                                                         ( 9 + count) % 32,
254                                                         (10 + count) % 32,
255                                                         (11 + count) % 32,
256                                                         (12 + count) % 32,
257                                                         (13 + count) % 32,
258                                                         (14 + count) % 32,
259                                                         (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
260         }
261     }
262     else
263     {
264         byte16 ab = cast(byte16)a;
265         byte16 bb = cast(byte16)b;
266         byte16 r;
267 
268         for (int i = 0; i < 16; ++i)
269         {
270             const int srcpos = count + cast(int)i;
271             if (srcpos > 31) 
272             {
273                 r.ptr[i] = 0;
274             } 
275             else if (srcpos > 15) 
276             {
277                 r.ptr[i] = ab.array[(srcpos) & 15];
278             } 
279             else 
280             {
281                 r.ptr[i] = bb.array[srcpos];
282             }
283        }
284        return cast(__m128i)r;
285     }
286 }
287 unittest
288 {
289     __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
290     __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
291 
292     {
293         byte16 C = cast(byte16)_mm_alignr_epi8!0(A ,B);
294         byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
295         assert(C.array == correct);
296     }
297     {
298         byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B);
299         byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
300         assert(C.array == correct);
301     }
302     {
303         byte16 C = cast(byte16)_mm_alignr_epi8!34(A ,B);
304         byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
305         assert(C.array == correct);
306     }
307 
308     __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
309     __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
310     byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
311     byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
312     assert(F.array == correct);
313 }
314 
315 /// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
316 __m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
317 {
318     // PERF DMD
319     static if (GDC_with_SSSE3)
320     {
321         return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
322     }
323     else version(LDC)
324     {
325         static if (count >= 16)
326         {
327             return _mm_setzero_si64();
328         }
329         else static if (count < 8)
330         {
331             // Note: in LDC x86 this uses a pshufb.
332             // Generates ext in arm64.
333             return cast(__m64) shufflevector!(byte8, (0 + count),
334                                                      (1 + count),
335                                                      (2 + count),
336                                                      (3 + count),
337                                                      (4 + count),
338                                                      (5 + count),
339                                                      (6 + count),
340                                                      (7 + count))(cast(byte8)b, cast(byte8)a);
341         }
342         else
343         {
344             return cast(__m64) shufflevector!(byte8, (0 + count)%16,
345                                                      (1 + count)%16,
346                                                      (2 + count)%16,
347                                                      (3 + count)%16,
348                                                      (4 + count)%16,
349                                                      (5 + count)%16,
350                                                      (6 + count)%16,
351                                                      (7 + count)%16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
352         }
353     }
354     else
355     {
356         byte8 ab = cast(byte8)a;
357         byte8 bb = cast(byte8)b;
358         byte8 r;
359 
360         for (int i = 0; i < 8; ++i)
361         {
362             const int srcpos = count + cast(int)i;
363             if (srcpos > 15) 
364             {
365                 r.ptr[i] = 0;
366             } 
367             else if (srcpos > 7) 
368             {
369                 r.ptr[i] = ab.array[(srcpos) & 7];
370             } 
371             else 
372             {
373                 r.ptr[i] = bb.array[srcpos];
374             }
375        }
376        return cast(__m64)r;
377     }
378 }
379 unittest
380 {
381     __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
382     __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);
383 
384     {
385         byte8 C = cast(byte8)_mm_alignr_pi8!0(A ,B);
386         byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
387         assert(C.array == correct);
388     }
389 
390     {
391         byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B);
392         byte[8] correct = [ 20, 21, 22, 23, 24, 1, 2, 3];
393         assert(C.array == correct);
394     }
395     {
396         byte8 C = cast(byte8)_mm_alignr_pi8!11(A ,B);
397         byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
398         assert(C.array == correct);
399     }
400     {
401         byte8 C = cast(byte8)_mm_alignr_pi8!17(A ,B);
402         byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
403         assert(C.array == correct);
404     }
405 }
406 
407 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
408 __m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
409 {
410     // PERF DMD
411     static if (GDC_with_SSSE3)
412     {
413         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
414     }
415     else static if (LDC_with_SSSE3)
416     {
417         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
418     }
419     else static if (LDC_with_ARM64)
420     {
421         return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
422     }
423     else
424     {
425         short8 sa = cast(short8)a;
426         short8 sb = cast(short8)b;
427         short8 r;
428         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
429         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
430         r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
431         r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
432         r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
433         r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
434         r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
435         r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
436         return cast(__m128i)r;
437     }
438 }
439 unittest
440 {
441     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
442     short8 C = cast(short8) _mm_hadd_epi16(A, A);
443     short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
444     assert(C.array == correct);
445 }
446 
447 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
448 __m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
449 { 
450     // PERF DMD
451     static if (GDC_with_SSSE3)
452     {
453         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
454     }
455     else static if (LDC_with_SSSE3)
456     {
457         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
458     }
459     else static if (LDC_with_ARM64)
460     {
461         return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
462     }
463     else
464     {
465         int4 ia = cast(int4)a;
466         int4 ib = cast(int4)b;
467         int4 r;
468         r.ptr[0] = ia.array[0] + ia.array[1];
469         r.ptr[1] = ia.array[2] + ia.array[3];
470         r.ptr[2] = ib.array[0] + ib.array[1];
471         r.ptr[3] = ib.array[2] + ib.array[3];
472         return cast(__m128i)r;
473     }
474 }
475 unittest
476 {
477     __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
478     __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
479     int4 C = cast(int4) _mm_hadd_epi32(A, B);
480     int[4] correct = [ -1, int.max, int.min, 0 ];
481     assert(C.array == correct);
482 }
483 
484 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
485 __m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
486 {
487     // PERF DMD
488     static if (GDC_with_SSSE3)
489     {
490         return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
491     }
492     else static if (LDC_with_ARM64)
493     {
494         return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
495     }
496     else
497     {
498         // LDC x86: generates phaddw since LDC 1.24 -O2.
499         short4 r;
500         short4 sa = cast(short4)a;
501         short4 sb = cast(short4)b;
502         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); 
503         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
504         r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
505         r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
506         return cast(__m64)r;
507     }
508 }
509 unittest
510 {
511     __m64 A = _mm_setr_pi16(1, -2, 4, 8);
512     __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
513     short4 C = cast(short4) _mm_hadd_pi16(A, B);
514     short[4] correct = [ -1, 12, 48, 32767 ];
515     assert(C.array == correct);
516 }
517 
518 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, 
519 /// and pack the signed 32-bit results.
520 __m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
521 {
522     // PERF DMD
523     static if (GDC_with_SSSE3)
524     {
525         return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
526     }
527     else static if (LDC_with_ARM64)
528     {
529         return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
530     }
531     else
532     {
533         // LDC x86: generates phaddd since LDC 1.24 -O2
534         int2 ia = cast(int2)a;
535         int2 ib = cast(int2)b;
536         int2 r;
537         r.ptr[0] = ia.array[0] + ia.array[1];
538         r.ptr[1] = ib.array[0] + ib.array[1];
539         return cast(__m64)r;
540     }
541 }
542 unittest
543 {
544     __m64 A = _mm_setr_pi32(int.min, -1);
545     __m64 B = _mm_setr_pi32(1, int.max);
546     int2 C = cast(int2) _mm_hadd_pi32(A, B);
547     int[2] correct = [ int.max, int.min ];
548     assert(C.array == correct);
549 }
550 
551 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
552 /// and pack the signed 16-bit results.
553 __m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
554 {
555      // PERF DMD
556     static if (GDC_with_SSSE3)
557     {
558         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
559     }
560     else static if (LDC_with_SSSE3)
561     {
562         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
563     }
564     else static if (LDC_with_ARM64)
565     {
566         // uzp1/uzp2/sqadd sequence
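        // (c gathers the even-indexed lanes of a:b, d the odd-indexed ones;
        //  a saturating add of the two then yields the horizontal sums.)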
567         short8 sa = cast(short8)a;
568         short8 sb = cast(short8)b;
569         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
570         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
571         return cast(__m128i)vqaddq_s16(c, d);
572     }
573     else
574     {
575         short8 sa = cast(short8)a;
576         short8 sb = cast(short8)b;
577         short8 r;
578         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
579         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
580         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
581         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
582         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
583         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
584         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
585         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
586         return cast(__m128i)r;
587     }
588 }
589 unittest
590 {
591     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
592     short8 C = cast(short8) _mm_hadds_epi16(A, A);
593     short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
594     assert(C.array == correct);
595 }
596 
597 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
598 /// and pack the signed 16-bit results.
599 __m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
600 {
601     static if (GDC_with_SSSE3)
602     {
603         return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
604     }
605     else static if (LDC_with_SSSE3)
606     {
607         // Note: LDC doesn't have __builtin_ia32_phaddsw
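        // Widen both operands to 128-bit and use phaddsw128 instead: its result
        // shorts are [a0+a1, a2+a3, 0, 0, b0+b1, b2+b3, 0, 0], so 32-bit lanes
        // 0 and 2 hold the four sums we need.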
608         long2 la;
609         la.ptr[0] = a.array[0];
610         long2 lb;
611         lb.ptr[0] = b.array[0];
612         int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
613         int2 r;
614         r.ptr[0] = sum.array[0];
615         r.ptr[1] = sum.array[2];
616         return cast(__m64)r;
617     }
618     else static if (LDC_with_ARM64)
619     {
620         // uzp1/uzp2/sqadd sequence
621         short4 sa = cast(short4)a;
622         short4 sb = cast(short4)b;
623         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
624         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
625         return cast(__m64)vqadd_s16(c, d);
626     }
627     else
628     {
629         short4 sa = cast(short4)a;
630         short4 sb = cast(short4)b;
631         short4 r;
632         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
633         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
634         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
635         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
636         return cast(__m64)r;
637     }
638 }
639 unittest
640 {
641     __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
642     __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
643     short4 C = cast(short4) _mm_hadds_pi16(A, B);
644     short[4] correct = [ 16, -32768,  96,  32767];
645     assert(C.array == correct);
646 }
647 
648 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
650 __m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
651 {
652     // PERF DMD
653     static if (GDC_with_SSSE3)
654     {
655         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
656     }
657     else static if (LDC_with_SSSE3)
658     {
659         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
660     }
661     else static if (LDC_with_ARM64)
662     {
663         // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1 
664         short8 sa = cast(short8)a;
665         short8 sb = cast(short8)b;
666         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
667         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
668         return cast(__m128i)(c - d);
669     }
670     else 
671     {
672         short8 sa = cast(short8)a;
673         short8 sb = cast(short8)b;
674         short8 r;
675         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
676         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
677         r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
678         r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
679         r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
680         r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
681         r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
682         r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
683         return cast(__m128i)r;
684     }
685 }
686 unittest
687 {
688     __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
689     short8 C = cast(short8) _mm_hsub_epi16(A, A);
690     short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
691     assert(C.array == correct);
692 }
693 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
695 __m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
696 { 
697     // PERF DMD
698     static if (GDC_with_SSSE3)
699     {
700         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
701     }
702     else static if (LDC_with_SSSE3)
703     {
704         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
705     }
706     else static if (LDC_with_ARM64)
707     {
708         // Produce uzp1 uzp2 sub sequence since LDC 1.8 -O1 
709         int4 ia = cast(int4)a;
710         int4 ib = cast(int4)b;
711         int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
712         int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
713         return cast(__m128i)(c - d);
714     }
715     else
716     {
717         int4 ia = cast(int4)a;
718         int4 ib = cast(int4)b;
719         int4 r;
720         r.ptr[0] = ia.array[0] - ia.array[1];
721         r.ptr[1] = ia.array[2] - ia.array[3];
722         r.ptr[2] = ib.array[0] - ib.array[1];
723         r.ptr[3] = ib.array[2] - ib.array[3];
724         return cast(__m128i)r;
725     }
726 }
727 unittest
728 {
729     __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
730     __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
731     int4 C = cast(int4) _mm_hsub_epi32(A, B);
732     int[4] correct = [ -1, int.max, int.min, 0 ];
733     assert(C.array == correct);
734 }
735 
736 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, 
737 /// and pack the signed 16-bit results.
738 __m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
739 {
740     // PERF DMD
741     static if (GDC_with_SSSE3)
742     {
743         return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
744     }
745     else static if (LDC_with_ARM64)
746     {
747         // Produce uzp1 uzp2 sub sequence since LDC 1.3 -O1 
748         short4 sa = cast(short4)a;
749         short4 sb = cast(short4)b;
750         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
751         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
752         return cast(__m64)(c - d);
753     }
754     else
755     {
756         // LDC x86: generates phsubw since LDC 1.24 -O2
757         short4 sa = cast(short4)a;
758         short4 sb = cast(short4)b;
759         short4 r;
760         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
761         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
762         r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
763         r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
764         return cast(__m64)r;
765     }
766 }
767 unittest
768 {
769     __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
770     __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
771     short4 C = cast(short4) _mm_hsub_pi16(A, B);
772     short[4] correct = [ short.max, -4, -16, -32767];
773     assert(C.array == correct);
774 }
775 
776 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, 
777 /// and pack the signed 32-bit results.
778 __m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
779 {
780     // PERF DMD
781     static if (GDC_with_SSSE3)
782     {
783         return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
784     }
785     else static if (LDC_with_ARM64)
786     {
787         // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
788         int2 ia = cast(int2)a;
789         int2 ib = cast(int2)b;
790         int2 c = shufflevector!(int2, 0, 2)(ia, ib);
791         int2 d = shufflevector!(int2, 1, 3)(ia, ib);
792         return cast(__m64)(c - d);
793     }
794     else
795     {
796         // LDC x86: generates phsubd since LDC 1.24 -O2
797         int2 ia = cast(int2)a;
798         int2 ib = cast(int2)b;
799         int2 r;
800         r.ptr[0] = ia.array[0] - ia.array[1];
801         r.ptr[1] = ib.array[0] - ib.array[1];
802         return cast(__m64)r;
803     }
804 }
805 unittest
806 {
807     __m64 A = _mm_setr_pi32(int.min, 1);
808     __m64 B = _mm_setr_pi32(int.max, -1);
809     int2 C = cast(int2) _mm_hsub_pi32(A, B);
810     int[2] correct = [ int.max, int.min ];
811     assert(C.array == correct);
812 }
813 
814 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
815 /// and pack the signed 16-bit results.
816 __m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
817 {
818      // PERF DMD
819     static if (GDC_with_SSSE3)
820     {
821         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
822     }
823     else static if (LDC_with_SSSE3)
824     {
825         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
826     }
827     else static if (LDC_with_ARM64)
828     {
829         // uzp1/uzp2/sqsub sequence
830         short8 sa = cast(short8)a;
831         short8 sb = cast(short8)b;
832         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
833         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
834         return cast(__m128i)vqsubq_s16(c, d);
835     }
836     else
837     {
838         short8 sa = cast(short8)a;
839         short8 sb = cast(short8)b;
840         short8 r;
841         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
842         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
843         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
844         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
845         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
846         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
847         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
848         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
849         return cast(__m128i)r;
850     }
851 }
852 unittest
853 {
854     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
855     short8 C = cast(short8) _mm_hsubs_epi16(A, A);
856     short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
857     assert(C.array == correct);
858 }
859 
860 
861 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
862 /// and pack the signed 16-bit results.
863 __m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
864 {
865     static if (GDC_with_SSSE3)
866     {
867         return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
868     }
869     else static if (LDC_with_SSSE3)
870     {
871         // Note: LDC doesn't have __builtin_ia32_phsubsw
872         long2 la;
873         la.ptr[0] = a.array[0];
874         long2 lb;
875         lb.ptr[0] = b.array[0];
876         int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
877         int2 r;
878         r.ptr[0] = sum.array[0];
879         r.ptr[1] = sum.array[2];
880         return cast(__m64)r;
881     }
882     else static if (LDC_with_ARM64)
883     {
884         // uzp1/uzp2/sqsub sequence in -O1
885         short4 sa = cast(short4)a;
886         short4 sb = cast(short4)b;
887         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
888         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
889         return cast(__m64)vqsub_s16(c, d);
890     }
891     else
892     {
893         short4 sa = cast(short4)a;
894         short4 sb = cast(short4)b;
895         short4 r;
896         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
897         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
898         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
899         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
900         return cast(__m64)r;
901     }
902 }
903 unittest
904 {
905     __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
906     __m64 B = _mm_setr_pi16( 64, 30,   -9,  32767);
907     short4 C = cast(short4) _mm_hsubs_pi16(A, B);
908     short[4] correct = [ -48, 32767,  34,  -32768];
909     assert(C.array == correct);
910 }
911 
912 
913 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
914 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
915 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
916 /// and pack the saturated results.
917 __m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
918 {
919     static if (GDC_with_SSSE3)
920     {
921         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
922     }
923     else static if (LDC_with_SSSE3)
924     {
925         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
926     }
927     else
928     {
929         // zero-extend a to 16-bit
930         __m128i zero = _mm_setzero_si128();
931         __m128i a_lo = _mm_unpacklo_epi8(a, zero);
932         __m128i a_hi = _mm_unpackhi_epi8(a, zero);
933 
934         // sign-extend b to 16-bit
935         __m128i b_lo = _mm_unpacklo_epi8(b, zero);
936         __m128i b_hi = _mm_unpackhi_epi8(b, zero);    
937         b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
938         b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8); 
939 
940         // Multiply element-wise, no overflow can occur
941         __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);  
942         __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);
943 
944         // Add pairwise with saturating horizontal add
945         return _mm_hadds_epi16(c_lo, c_hi);
946     }
947 }
948 unittest
949 {
950     __m128i A = _mm_setr_epi8(  -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
951     __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
952     short8 C = cast(short8) _mm_maddubs_epi16(A, B);
953     short[8] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0];
954     assert(C.array == correct);
955 }
956 
957 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
958 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
959 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
960 /// and pack the saturated results.
961 __m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
962 {
963     static if (GDC_with_SSSE3)
964     {
965         return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
966     }
967     else static if (LDC_with_SSSE3)
968     {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
972     }
973     else
974     {
975         // zero-extend a to 16-bit
976         __m128i zero = _mm_setzero_si128();
977         __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);
978 
979         // sign-extend b to 16-bit
980         __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);    
981         B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);
982 
983         // Multiply element-wise, no overflow can occur
984         __m128i c = _mm_mullo_epi16(A, B);
985 
986         // Add pairwise with saturating horizontal add
987         return to_m64( _mm_hadds_epi16(c, zero));
988     }
989 }
990 unittest
991 {
992     __m64 A = _mm_setr_pi8(  -1,  10, 100, -128, 0, 0, 0, 0); // u8
993     __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
994     short4 C = cast(short4) _mm_maddubs_pi16(A, B);
995     short[4] correct =       [   -32768,   26256, 0, 0];
996     assert(C.array == correct);
997 }
998 
999 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
1000 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
1001 __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
1002 {
1003     // PERF DMD
1004     static if (GDC_with_SSSE3)
1005     {
1006         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
1007     }
1008     else static if (LDC_with_SSSE3)
1009     {
1010         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
1011     }
1012     else static if (LDC_with_ARM64)
1013     {
1014         int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
1015                                 vget_low_s16(cast(short8)b));
1016         int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
1017                                 vget_high_s16(cast(short8)b));
1018 
1019         // Rounding narrowing shift right
1020         // narrow = (int16_t)((mul + 16384) >> 15);
1021         short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
1022         short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);
1023 
1024         // Join together.
1025         return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
1026     }
1027     else
1028     {
1029         short8 sa = cast(short8)a;
1030         short8 sb = cast(short8)b;
1031         short8 r;
1032 
1033         for (int i = 0; i < 8; ++i)
1034         {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to the Intel pseudocode.
1036             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
1037         }
1038 
1039         return cast(__m128i)r;
1040     }
1041 }
1042 
1043 unittest
1044 {
1045     __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
1046     __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
1047     short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
1048     short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
1049     assert(C.array == correct);
1050 }
1051 
1052 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
1053 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
1054 __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
1055 {
1056     // PERF DMD
1057     static if (GDC_with_SSSE3)
1058     {
1059         return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
1060     }
1061     else static if (LDC_with_SSSE3)
1062     {
1063         return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
1064     }
1065     else static if (LDC_with_ARM64)
1066     {
1067         int4 mul = vmull_s16(cast(short4)a, cast(short4)b);
1068 
1069         // Rounding narrowing shift right
1070         // (int16_t)((mul + 16384) >> 15);
1071         return cast(__m64) vrshrn_n_s32(mul, 15);
1072     }
1073     else
1074     {
1075         short4 sa = cast(short4)a;
1076         short4 sb = cast(short4)b;
1077         short4 r;
1078 
1079         for (int i = 0; i < 4; ++i)
1080         {
1081             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
1082         }
1083         return cast(__m64)r;
1084     }
1085 }
1086 unittest
1087 {
1088     __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
1089     __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
1090     short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
1091     short[4] correct = [3344, 24487, 15678, 0];
1092     assert(C.array == correct);
1093 }
1094 
1095 
1096 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1097 __m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
1098 {
1099     // This is the lovely pshufb.
1100     // PERF DMD
1101     static if (GDC_with_SSSE3)
1102     {
1103         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1104     }
1105     else static if (LDC_with_SSSE3)
1106     {
1107         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1108     }
1109     else static if (LDC_with_ARM64)
1110     {
1111         byte16 bb = cast(byte16)b;
1112         byte16 mask;
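        // Keep the low 4 index bits and bit 7: indices with bit 7 set are >= 16,
        // which vqtbl1q_s8 maps to zero, matching pshufb's zeroing behaviour.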
1113         mask = cast(byte)(0x8F);
1114         bb = bb & mask;
1115         byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
1116         return cast(__m128i)r;
1117     }
1118     else
1119     {
1120         byte16 r;
1121         byte16 ba = cast(byte16)a;
1122         byte16 bb = cast(byte16)b;
1123         for (int i = 0; i < 16; ++i)
1124         {
1125             byte s = bb.array[i];
1126             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
1127         }
1128         return cast(__m128i)r;
1129     }
1130 }
1131 unittest
1132 {
1133     __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
1134     __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
1135     byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
1136     byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
1137     assert(C.array == correct);
1138 }
1139 
1140 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1141 __m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
1142 {
1143     // PERF DMD
1144     static if (GDC_with_SSSE3)
1145     {
1146         alias ubyte8  =__vector(ubyte[8]);
1147         return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
1148     }
1149     else static if (LDC_with_SSSE3)
1150     {
        // GDC does the proper dance to avoid MMX registers; do it manually in LDC, since __builtin_ia32_pshufb doesn't exist there.
1152         __m128i A = to_m128i(a);
1153         __m128i index = to_m128i(b);
1154         index = index & _mm_set1_epi32(0xF7F7F7F7);
1155         return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
1156     }
1157     else static if (LDC_with_ARM64)
1158     {
1159         byte8 bb = cast(byte8)b;
1160         byte8 mask;
1161         mask = cast(byte)(0x87);
1162         bb = bb & mask;
1163         __m128i l = to_m128i(a);
1164         byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
1165         return cast(__m64)r;
1166     }
1167     else
1168     {
1169         byte8 r;
1170         byte8 ba = cast(byte8)a;
1171         byte8 bb = cast(byte8)b;
1172         for (int i = 0; i < 8; ++i)
1173         {
1174             byte s = bb.array[i];
1175             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
1176         }
1177         return cast(__m64)r;
1178     }
1179 }
1180 unittest
1181 {
1182     __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
1183     __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
1184     byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
1185     byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
1186     assert(C.array == correct);
1187 }
1188 
1189 /// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
1190 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1191 __m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
1192 {
1193     // PERF DMD
1194     static if (GDC_with_SSSE3)
1195     {
1196         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
1197     }
1198     else static if (LDC_with_SSSE3)
1199     {
1200         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);       
1201     }
1202     else
1203     {
1204         // LDC arm64: 5 instructions
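        // mask is 0 or -1 per lane; (a + mask) ^ mask leaves a unchanged where
        // mask == 0 and computes ~(a - 1) == -a where mask == -1. zeromask then
        // clears lanes where b == 0.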
1205         __m128i mask = _mm_srai_epi16(b, 15);
1206         __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
1207         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
1208     }
1209 }
1210 unittest
1211 {
1212     __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
1213     __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
1214     short8 C = cast(short8) _mm_sign_epi16(A, B);
1215     short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
1216     assert(C.array == correct);
1217 }
1218 
1219 /// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative. 
1220 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1221 __m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
1222 {
1223     // PERF DMD
1224     static if (GDC_with_SSSE3)
1225     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1227     }
1228     else static if (LDC_with_SSSE3)
1229     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1231     }
1232     else
1233     {
1234         __m128i mask = _mm_srai_epi32(b, 31);
1235         __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
1236         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
1237     }
1238 }
1239 unittest
1240 {
1241     __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
1242     __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
1243     int4 C = cast(int4) _mm_sign_epi32(A, B);
1244     int[4] correct =          [ 2,  0, 0, int.max];
1245     assert(C.array == correct);
1246 }
1247 
1248 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1249 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1250 __m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
1251 {
1252     // PERF DMD
1253     static if (GDC_with_SSSE3)
1254     {
1255         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1256     }
1257     else static if (LDC_with_SSSE3)
1258     {
1259         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1260     }
1261     else
1262     {
1263         __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
1264         __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
1265         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
1266     }
1267 }
1268 unittest
1269 {
1270     __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
1271     __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1272     byte16  C = cast(byte16) _mm_sign_epi8(A, B);
1273     byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
1274     assert(C.array == correct);
1275 }
1276 
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1279 __m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
1280 {
1281     return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
1282 }
1283 unittest
1284 {
1285     __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
1286     __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
1287     short4 C = cast(short4) _mm_sign_pi16(A, B);
1288     short[4] correct =     [-2, short.min,         0, short.min];
1289     assert(C.array == correct);
1290 }
1291 
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1294 __m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
1295 {
1296     return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
1297 }
1298 unittest
1299 {
1300     __m64 A = _mm_setr_pi32(-2, -100);
1301     __m64 B = _mm_setr_pi32(-1,  0);
1302     int2 C = cast(int2) _mm_sign_pi32(A, B);
1303     int[2] correct =          [ 2,  0];
1304     assert(C.array == correct);
1305 }
1306 
1307 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1308 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1309 __m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
1310 {
1311     return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
1312 }
1313 unittest
1314 {
1315     __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1316     __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
1317     byte8  C = cast(byte8) _mm_sign_pi8(A, B);
1318     byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
1319     assert(C.array == correct);
1320 }