/**
* SSSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSSE3
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.tmmintrin;

public import inteli.types;
import inteli.internals;

public import inteli.pmmintrin;
import inteli.mmx;

nothrow @nogc:


// SSSE3 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
// Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.
// With GDC, use "dflags-gdc": ["-mssse3"] or equivalent to generate SSSE3 instructions.
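// For instance, a dub.json fragment for a hypothetical project (package name and
// version constraint are placeholders) might enable SSSE3 code generation like this:
//
//     {
//         "name": "myapp",
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+ssse3"],
//         "dflags-gdc": ["-mssse3"]
//     }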

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m128i _mm_abs_epi16 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSW, a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s16(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pabsw since LDC 1.1 -O2
        short8 sa = cast(short8)a;
        for (int i = 0; i < 8; ++i)
        {
            short s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
    short8 B = cast(short8) _mm_abs_epi16(A);
    short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m128i _mm_abs_epi32 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
    }
    else static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s32(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pabsd since LDC 1.1 -O2
        int4 sa = cast(int4)a;
        for (int i = 0; i < 4; ++i)
        {
            int s = sa.array[i];
            sa.ptr[i] = s >= 0 ? s : -s;
        }
        return cast(__m128i)sa;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
    int4 B = cast(int4) _mm_abs_epi32(A);
    int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m128i _mm_abs_epi8 (__m128i a) @trusted
{
    static if (DMD_with_DSIMD)
    {
        return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
    }
    else static if (GDC_with_SSSE3)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i) vabsq_s8(cast(byte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: generates pabsb since LDC 1.1 -O1
        //     arm64: generates abs since LDC 1.8 -O1
        enum ir = `
                %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
                %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
                ret <16 x i8> %r`;
        return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
    }
    else
    {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This form doesn't generate pabsb in LDC though.
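        // Why this works: for each byte x, exactly one of x and -x lies in [0..127]
        // (both are 128 for x == -128), so the unsigned minimum of x and -x picks
        // |x|, matching pabsb semantics including abs(-128) == -128.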
        return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 B = cast(byte16) _mm_abs_epi8(A);
    byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 64-bit floating-point elements in `a`.
/// #BONUS.
__m128d _mm_abs_pd (__m128d a) @trusted
{
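    // IEEE 754 doubles keep their sign in the most significant bit,
    // so clearing that bit of each lane yields the absolute value.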
    long2 mask = 0x7fff_ffff_ffff_ffff;
    return cast(__m128d)((cast(long2)a) & mask);
}
unittest
{
    __m128d A = _mm_setr_pd(-42.0, -double.infinity);
    __m128d R = _mm_abs_pd(A);
    double[2] correct =    [42.0, +double.infinity];
    assert(R.array == correct);
}

/// Compute the absolute value of packed signed 16-bit integers in `a`.
__m64 _mm_abs_pi16 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi16(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
    short4 B = cast(short4) _mm_abs_pi16(A);
    short[4] correct = [0, 1, -32768, 32767];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 32-bit integers in `a`.
__m64 _mm_abs_pi32 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi32(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
    int2 B = cast(int2) _mm_abs_pi32(A);
    int[2] correct = [1, -2_147_483_648];
    assert(B.array == correct);
}

/// Compute the absolute value of packed signed 8-bit integers in `a`.
__m64 _mm_abs_pi8 (__m64 a) @trusted
{
    return to_m64(_mm_abs_epi8(to_m128i(a)));
}
unittest
{
    __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
    byte8 B = cast(byte8) _mm_abs_pi8(A);
    byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
    assert(B.array == correct);
}

/// Compute the absolute value of packed 32-bit floating-point elements in `a`.
/// #BONUS.
__m128 _mm_abs_ps (__m128 a) @trusted
{
    __m128i mask = 0x7fffffff;
    return cast(__m128)((cast(__m128i)a) & mask);
}
unittest
{
    __m128 A = _mm_setr_ps(-0.0f, 10.0f, -42.0f, -float.infinity);
    __m128 R = _mm_abs_ps(A);
    float[4] correct =    [0.0f, 10.0f, 42.0f, +float.infinity];
    assert(R.array == correct);
}

/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
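/// `count` is expressed in bytes; any `count >= 32` yields an all-zero result.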
__m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 32)
        {
            return _mm_setzero_si128();
        }
        else static if (count < 16)
        {
            // Generates palignr since LDC 1.1 -O1
            // Also generates a single ext instruction on arm64.
            return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count),
                                                           ( 1 + count),
                                                           ( 2 + count),
                                                           ( 3 + count),
                                                           ( 4 + count),
                                                           ( 5 + count),
                                                           ( 6 + count),
                                                           ( 7 + count),
                                                           ( 8 + count),
                                                           ( 9 + count),
                                                           (10 + count),
                                                           (11 + count),
                                                           (12 + count),
                                                           (13 + count),
                                                           (14 + count),
                                                           (15 + count))(cast(byte16)b, cast(byte16)a);
        }
        else
        {
            return cast(__m128i) shufflevectorLDC!(byte16, ( 0 + count) % 32,
                                                           ( 1 + count) % 32,
                                                           ( 2 + count) % 32,
                                                           ( 3 + count) % 32,
                                                           ( 4 + count) % 32,
                                                           ( 5 + count) % 32,
                                                           ( 6 + count) % 32,
                                                           ( 7 + count) % 32,
                                                           ( 8 + count) % 32,
                                                           ( 9 + count) % 32,
                                                           (10 + count) % 32,
                                                           (11 + count) % 32,
                                                           (12 + count) % 32,
                                                           (13 + count) % 32,
                                                           (14 + count) % 32,
                                                           (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
        }
    }
    else
    {
        byte16 ab = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        byte16 r;

        for (int i = 0; i < 16; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 31)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 15)
            {
                r.ptr[i] = ab.array[srcpos & 15];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A, B);
        byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
        byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte16 C = cast(byte16)_mm_alignr_epi8!34(A, B);
        byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }

    __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
    __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
    byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
    byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
    assert(F.array == correct);
}

/// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
__m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
    }
    else version(LDC)
    {
        static if (count >= 16)
        {
            return _mm_setzero_si64();
        }
        else static if (count < 8)
        {
            // Note: in LDC x86 this uses a pshufb.
            // Generates ext in arm64.
            return cast(__m64) shufflevectorLDC!(byte8, (0 + count),
                                                        (1 + count),
                                                        (2 + count),
                                                        (3 + count),
                                                        (4 + count),
                                                        (5 + count),
                                                        (6 + count),
                                                        (7 + count))(cast(byte8)b, cast(byte8)a);
        }
        else
        {
            return cast(__m64) shufflevectorLDC!(byte8, (0 + count) % 16,
                                                        (1 + count) % 16,
                                                        (2 + count) % 16,
                                                        (3 + count) % 16,
                                                        (4 + count) % 16,
                                                        (5 + count) % 16,
                                                        (6 + count) % 16,
                                                        (7 + count) % 16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
        }
    }
    else
    {
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab.array[srcpos & 7];
            }
            else
            {
                r.ptr[i] = bb.array[srcpos];
            }
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
    __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A, B);
        byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == correct);
    }

    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!11(A, B);
        byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!17(A, B);
        byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
        assert(C.array == correct);
    }
}

/// Reverse endianness of 16-bit integers in `a`.
__m128i _mm_bswap_epi16 (__m128i a) pure @safe // #BONUS
{
    __m128i order = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi16(0x1122, 0x3344, 0, -1, 0x1122, 0x3344, 0, -1);
    short8 R = cast(short8) _mm_bswap_epi16(A);
    short[8] correct = [0x2211, 0x4433, 0, -1, 0x2211, 0x4433, 0, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 32-bit integers in `a`.
__m128i _mm_bswap_epi32 (__m128i a) pure @safe // #BONUS
{
    __m128i order = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi32(0x11223344, 0x33445566, 0, -1);
    int4 R = cast(int4) _mm_bswap_epi32(A);
    int[4] correct = [0x44332211, 0x66554433, 0, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 64-bit integers in `a`.
__m128i _mm_bswap_epi64 (__m128i a) pure @safe // #BONUS
{
    __m128i order = _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi64(0x11223344_55667788, -1);
    long2 R = cast(long2) _mm_bswap_epi64(A);
    long[2] correct = [0x88776655_44332211, -1];
    assert(R.array == correct);
}

/// Reverse endianness of 128-bit register `a`.
__m128i _mm_bswap_si128 (__m128i a) pure @safe // #BONUS
{
    __m128i order = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(a, order);
}
unittest
{
    __m128i A = _mm_setr_epi64(0x11223344_55667788, -1);
    long2 R = cast(long2) _mm_bswap_si128(A);
    long[2] correct = [-1, 0x88776655_44332211];
    assert(R.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hadd_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadd_epi16(A, A);
    short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hadd_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ia.array[2] + ia.array[3];
        r.ptr[2] = ib.array[0] + ib.array[1];
        r.ptr[3] = ib.array[2] + ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
    __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
    int4 C = cast(int4) _mm_hadd_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
    }
    else
    {
        // LDC x86: generates phaddw since LDC 1.24 -O2.
        short4 r;
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(1, -2, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
    short4 C = cast(short4) _mm_hadd_pi16(A, B);
    short[4] correct = [ -1, 12, 48, 32767 ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        return cast(__m64) vpadd_s32(cast(int2)a, cast(int2)b);
    }
    else
    {
        // LDC x86: generates phaddd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] + ia.array[1];
        r.ptr[1] = ib.array[0] + ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, -1);
    __m64 B = _mm_setr_pi32(1, int.max);
    int2 C = cast(int2) _mm_hadd_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hadds_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqaddq_s16(c, d);
    }
    else
    {
        // PERF well that doesn't look very fast?
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
    short8 C = cast(short8) _mm_hadds_epi16(A, A);
    short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
    assert(C.array == correct);
}

/// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phaddsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqadd sequence
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqadd_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
    __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
    short4 C = cast(short4) _mm_hadds_pi16(A, B);
    short[4] correct = [ 16, -32768,  96,  32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)(c - d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
        r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
        r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
        r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
        r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
    short8 C = cast(short8) _mm_hsub_epi16(A, A);
    short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m128i _mm_hsub_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.8 -O1
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 c = shufflevectorLDC!(int4, 0, 2, 4, 6)(ia, ib);
        int4 d = shufflevectorLDC!(int4, 1, 3, 5, 7)(ia, ib);
        return cast(__m128i)(c - d);
    }
    else
    {
        int4 ia = cast(int4)a;
        int4 ib = cast(int4)b;
        int4 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ia.array[2] - ia.array[3];
        r.ptr[2] = ib.array[0] - ib.array[1];
        r.ptr[3] = ib.array[2] - ib.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
    __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
    int4 C = cast(int4) _mm_hsub_epi32(A, B);
    int[4] correct = [ -1, int.max, int.min, 0 ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`,
/// and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Produces uzp1/uzp2/sub sequence since LDC 1.3 -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubw since LDC 1.24 -O2
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
        r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
        r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
        r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
    __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
    short4 C = cast(short4) _mm_hsub_pi16(A, B);
    short[4] correct = [ short.max, -4, -16, -32767];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`,
/// and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 c = shufflevectorLDC!(int2, 0, 2)(ia, ib);
        int2 d = shufflevectorLDC!(int2, 1, 3)(ia, ib);
        return cast(__m64)(c - d);
    }
    else
    {
        // LDC x86: generates phsubd since LDC 1.24 -O2
        int2 ia = cast(int2)a;
        int2 ib = cast(int2)b;
        int2 r;
        r.ptr[0] = ia.array[0] - ia.array[1];
        r.ptr[1] = ib.array[0] - ib.array[1];
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi32(int.min, 1);
    __m64 B = _mm_setr_pi32(int.max, -1);
    int2 C = cast(int2) _mm_hsub_pi32(A, B);
    int[2] correct = [ int.max, int.min ];
    assert(C.array == correct);
}

/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 c = shufflevectorLDC!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
        short8 d = shufflevectorLDC!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
        return cast(__m128i)vqsubq_s16(c, d);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
        r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
        r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
        r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
    short8 C = cast(short8) _mm_hsubs_epi16(A, A);
    short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
    assert(C.array == correct);
}


/// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation,
/// and pack the signed 16-bit results.
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        // Note: LDC doesn't have __builtin_ia32_phsubsw
        long2 la;
        la.ptr[0] = a.array[0];
        long2 lb;
        lb.ptr[0] = b.array[0];
        int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
        int2 r;
        r.ptr[0] = sum.array[0];
        r.ptr[1] = sum.array[2];
        return cast(__m64)r;
    }
    else static if (LDC_with_ARM64)
    {
        // uzp1/uzp2/sqsub sequence in -O1
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 c = shufflevectorLDC!(short4, 0, 2, 4, 6)(sa, sb);
        short4 d = shufflevectorLDC!(short4, 1, 3, 5, 7)(sa, sb);
        return cast(__m64)vqsub_s16(c, d);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;
        r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
        r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
        r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
        r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
    __m64 B = _mm_setr_pi16( 64, 30,   -9,  32767);
    short4 C = cast(short4) _mm_hsubs_pi16(A, B);
    short[4] correct = [ -48, 32767,  34,  -32768];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
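/// Per lane: `r[i] = saturate_i16(a[2i] * b[2i] + a[2i+1] * b[2i+1])`,
/// with `a` read as unsigned bytes and `b` as signed bytes.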
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i a_lo = _mm_unpacklo_epi8(a, zero);
        __m128i a_hi = _mm_unpackhi_epi8(a, zero);

        // sign-extend b to 16-bit
        __m128i b_lo = _mm_unpacklo_epi8(b, zero);
        __m128i b_hi = _mm_unpackhi_epi8(b, zero);
        b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
        b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);
        __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);

        // Add pairwise with saturating horizontal add
        return _mm_hadds_epi16(c_lo, c_hi);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(  -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
    __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
    short8 C = cast(short8) _mm_maddubs_epi16(A, B);
    short[8] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0];
    assert(C.array == correct);
}

/// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding
/// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers.
/// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
/// and pack the saturated results.
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
{
    static if (GDC_with_SSSE3)
    {
        return cast(__m64)__builtin_ia32_pmaddubsw(cast(ubyte8)a, cast(ubyte8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
    }
    else
    {
        // zero-extend a to 16-bit
        __m128i zero = _mm_setzero_si128();
        __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);

        // sign-extend b to 16-bit
        __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);
        B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);

        // Multiply element-wise, no overflow can occur
        __m128i c = _mm_mullo_epi16(A, B);

        // Add pairwise with saturating horizontal add
        return to_m64( _mm_hadds_epi16(c, zero));
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(  -1,  10, 100, -128, 0, 0, 0, 0); // u8
    __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
    short4 C = cast(short4) _mm_maddubs_pi16(A, B);
    short[4] correct =     [   -32768,   26256, 0, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
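/// Equivalently, each lane computes `(a * b + 0x4000) >> 15`.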
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
                                vget_low_s16(cast(short8)b));
        int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
                                vget_high_s16(cast(short8)b));

        // Rounding narrowing shift right
        // narrow = (int16_t)((mul + 16384) >> 15);
        short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
        short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);

        // Join together.
        return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
    }
    else
    {
        short8 sa = cast(short8)a;
        short8 sb = cast(short8)b;
        short8 r;

        for (int i = 0; i < 8; ++i)
        {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to the Intel pseudocode.
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }

        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
    __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
    short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
    short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
    assert(C.array == correct);
}

/// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
/// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)) );
    }
    else static if (LDC_with_ARM64)
    {
        int4 mul = vmull_s16(cast(short4)a, cast(short4)b);

        // Rounding narrowing shift right
        // (int16_t)((mul + 16384) >> 15);
        return cast(__m64) vrshrn_n_s32(mul, 15);
    }
    else
    {
        short4 sa = cast(short4)a;
        short4 sb = cast(short4)b;
        short4 r;

        for (int i = 0; i < 4; ++i)
        {
            r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
    __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
    short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
    short[4] correct = [3344, 24487, 15678, 0];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
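/// Bits 0-3 of each control byte select a byte of `a`; if bit 7 of the control byte is set, the result byte is zero.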
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b) pure @trusted
{
    // This is the lovely pshufb.
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(ubyte16) a, cast(ubyte16) b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
    }
    else static if (LDC_with_ARM64)
    {
        byte16 bb = cast(byte16)b;
        byte16 mask;
        mask = cast(byte)(0x8F);
        bb = bb & mask;
        // "If an index is out of range for the table, the result for that lookup is 0."
        // So, having bit 7 in indices will yield 0 correctly.
        byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
        return cast(__m128i)r;
    }
    else
    {
        byte16 r;
        byte16 ba = cast(byte16)a;
        byte16 bb = cast(byte16)b;
        for (int i = 0; i < 16; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
        }
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
    __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
    byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
    byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
    assert(C.array == correct);
}

/// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        alias ubyte8 = __vector(ubyte[8]);
        return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
    }
    else static if (LDC_with_SSSE3)
    {
        // GDC does a proper dance to avoid MMX registers; do the same manually in LDC,
        // since __builtin_ia32_pshufb doesn't exist there.
        __m128i A = to_m128i(a);
        __m128i index = to_m128i(b);
        index = index & _mm_set1_epi32(0xF7F7F7F7);
        return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
    }
    else static if (LDC_with_ARM64)
    {
        byte8 bb = cast(byte8)b;
        byte8 mask;
        mask = cast(byte)(0x87);
        bb = bb & mask;
        __m128i l = to_m128i(a);
        byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
        return cast(__m64)r;
    }
    else
    {
        byte8 r;
        byte8 ba = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        for (int i = 0; i < 8; ++i)
        {
            byte s = bb.array[i];
            r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
        }
        return cast(__m64)r;
    }
}
unittest
{
    __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
    __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
    byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
    byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi16 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
    }
    else
    {
        // LDC arm64: 5 instructions
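        // mask is 0 or -1 per lane; (a + mask) ^ mask negates `a` exactly where
        // b < 0 (two's complement identity: -x == ~(x - 1)), and zeromask then
        // clears the lanes where b == 0.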
        __m128i mask = _mm_srai_epi16(b, 15);
        __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
    __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
    short8 C = cast(short8) _mm_sign_epi16(A, B);
    short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi32 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
    }
    else
    {
        __m128i mask = _mm_srai_epi32(b, 31);
        __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
    __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
    int4 C = cast(int4) _mm_sign_epi32(A, B);
    int[4] correct =          [ 2,  0, 0, int.max];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m128i _mm_sign_epi8 (__m128i a, __m128i b) pure @safe
{
    // PERF DMD
    static if (GDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else static if (LDC_with_SSSE3)
    {
        return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
    }
    else
    {
        __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
        __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
        return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
    __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    byte16  C = cast(byte16) _mm_sign_epi8(A, B);
    byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
    assert(C.array == correct);
}

/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
    __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
    short4 C = cast(short4) _mm_sign_pi16(A, B);
    short[4] correct =     [-2, short.min,         0, short.min];
    assert(C.array == correct);
}

/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi32(-2, -100);
    __m64 B = _mm_setr_pi32(-1,  0);
    int2 C = cast(int2) _mm_sign_pi32(A, B);
    int[2] correct =      [ 2,  0];
    assert(C.array == correct);
}

/// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
__m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
{
    return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
}
unittest
{
    __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
    __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
    byte8  C = cast(byte8) _mm_sign_pi8(A, B);
    byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
    assert(C.array == correct);
}