1 /**
2 * SSSE3 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.tmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 
13 public import inteli.pmmintrin;
14 import inteli.mmx;
15 
16 nothrow @nogc:
17 
18 
19 // SSSE3 instructions
20 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
21 // Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.
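//
// For reference, a dub.json fragment enabling SSSE3 code generation with LDC on an
// x86 target might look like this (illustrative sketch, not a complete build file):
//
//     {
//         "dflags-ldc": ["-mattr=+ssse3"]
//     }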
24 
25 /// Compute the absolute value of packed signed 16-bit integers in `a`.
26 __m128i _mm_abs_epi16 (__m128i a) @trusted
27 {
28     static if (DMD_with_DSIMD)
29     {
30         return cast(__m128i)__simd(XMM.PABSW, a);
31     }
32     else static if (GDC_with_SSSE3)
33     {
34         return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
35     }
36     else static if (LDC_with_ARM64)
37     {
38         return cast(__m128i) vabsq_s16(cast(short8)a);
39     }
40     else
41     {
        // LDC x86: generates pabsw since LDC 1.1 -O2
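        // Note: abs(short.min) stays short.min here, matching the pabsw instruction (no saturation).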
43         short8 sa = cast(short8)a;
44         for (int i = 0; i < 8; ++i)
45         {
46             short s = sa.array[i];
47             sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
48         }  
49         return cast(__m128i)sa;
50     }
51 }
52 unittest
53 {
54     __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
55     short8 B = cast(short8) _mm_abs_epi16(A);
56     short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
57     assert(B.array == correct);
58 }
59 
60 /// Compute the absolute value of packed signed 32-bit integers in `a`.
61 __m128i _mm_abs_epi32 (__m128i a) @trusted
62 {
63     static if (DMD_with_DSIMD)
64     {
65         return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
66     }
67     else static if (GDC_with_SSSE3)
68     {
69         return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
70     }
71     else static if (LDC_with_ARM64)
72     {
73         return cast(__m128i) vabsq_s32(cast(int4)a);
74     }
75     else
76     {
77         // LDC x86: generates pabsd since LDC 1.1 -O2
78         int4 sa = cast(int4)a;
79         for (int i = 0; i < 4; ++i)
80         {
81             int s = sa.array[i];
82             sa.ptr[i] = s >= 0 ? s : -s;
83         }  
84         return cast(__m128i)sa;
85     } 
86 }
87 unittest
88 {
89     __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
90     int4 B = cast(int4) _mm_abs_epi32(A);
91     int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
92     assert(B.array == correct);
93 }
94 
95 /// Compute the absolute value of packed signed 8-bit integers in `a`.
96 __m128i _mm_abs_epi8 (__m128i a) @trusted
97 {
98     static if (DMD_with_DSIMD)
99     {
100         return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
101     }
102     else static if (GDC_with_SSSE3)
103     {
104         alias ubyte16 = __vector(ubyte[16]);
105         return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
106     }
107     else static if (LDC_with_ARM64)
108     {
109         return cast(__m128i) vabsq_s8(cast(byte16)a);
110     }
111     else version(LDC)
112     {
113         // LDC x86: generates pabsb since LDC 1.1 -O1
114         //     arm64: generates abs since LDC 1.8 -O1
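        // The IR below computes, per byte, (0 < x) ? x : (0 - x), so -128 stays -128.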
115         enum ir = `
116                 %n = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
117                 %s = icmp slt <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, %0
118                 %r = select <16 x i1> %s, <16 x i8> %0, <16 x i8> %n
119                 ret <16 x i8> %r`;
120         return cast(__m128i) LDCInlineIR!(ir, byte16, byte16)(cast(byte16)a);
121     }
122     else
123     {
124         // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow 
125         // in LDC x86 and wouldn't vectorize. Doesn't generate pabsb in LDC though.
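        // The unsigned minimum of x and 0 - x is |x| for each byte (with -128 staying -128, like pabsb).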
126         return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
127     }
128 }
129 unittest
130 {
131     __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
132     byte16 B = cast(byte16) _mm_abs_epi8(A);
133     byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
134     assert(B.array == correct);
135 }
136 
137 /// Compute the absolute value of packed signed 16-bit integers in `a`.
138 __m64 _mm_abs_pi16 (__m64 a) @trusted
139 {
140     return to_m64(_mm_abs_epi16(to_m128i(a)));
141 }
142 unittest
143 {
144     __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
145     short4 B = cast(short4) _mm_abs_pi16(A);
146     short[4] correct = [0, 1, -32768, 32767];
147     assert(B.array == correct);
148 }
149 
150 /// Compute the absolute value of packed signed 32-bit integers in `a`.
151 __m64 _mm_abs_pi32 (__m64 a) @trusted
152 {
153      return to_m64(_mm_abs_epi32(to_m128i(a)));
154 }
155 unittest
156 {
157     __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
158     int2 B = cast(int2) _mm_abs_pi32(A);
159     int[2] correct = [1, -2_147_483_648];
160     assert(B.array == correct);
161 }
162 
163 /// Compute the absolute value of packed signed 8-bit integers in `a`.
164 __m64 _mm_abs_pi8 (__m64 a) @trusted
165 {
166     return to_m64(_mm_abs_epi8(to_m128i(a)));
167 }
168 unittest
169 {
170     __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
171     byte8 B = cast(byte8) _mm_abs_pi8(A);
172     byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
173     assert(B.array == correct);
174 }
175 
176 /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
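/// Note: `count == 0` returns `b`, and `count == 16` returns `a`.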
177 __m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
178 {
179     // PERF DMD
180     static if (GDC_with_SSSE3)
181     {
182         return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
183     }
184     else version(LDC)
185     {
186         static if (count >= 32)
187         {
188             return _mm_setzero_si128();
189         }
190         else static if (count < 16)
191         {
192             // Generates palignr since LDC 1.1 -O1
193             // Also generates a single ext instruction on arm64.
194             return cast(__m128i) shufflevector!(byte16, ( 0 + count),
195                                                         ( 1 + count),
196                                                         ( 2 + count),
197                                                         ( 3 + count),
198                                                         ( 4 + count),
199                                                         ( 5 + count),
200                                                         ( 6 + count),
201                                                         ( 7 + count),
202                                                         ( 8 + count),
203                                                         ( 9 + count),
204                                                         (10 + count),
205                                                         (11 + count),
206                                                         (12 + count),
207                                                         (13 + count),
208                                                         (14 + count),
209                                                         (15 + count))(cast(byte16)b, cast(byte16)a);
210         }
211         else
212         {
213             return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
214                                                         ( 1 + count) % 32,
215                                                         ( 2 + count) % 32,
216                                                         ( 3 + count) % 32,
217                                                         ( 4 + count) % 32,
218                                                         ( 5 + count) % 32,
219                                                         ( 6 + count) % 32,
220                                                         ( 7 + count) % 32,
221                                                         ( 8 + count) % 32,
222                                                         ( 9 + count) % 32,
223                                                         (10 + count) % 32,
224                                                         (11 + count) % 32,
225                                                         (12 + count) % 32,
226                                                         (13 + count) % 32,
227                                                         (14 + count) % 32,
228                                                         (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
229         }
230     }
231     else
232     {
233         byte16 ab = cast(byte16)a;
234         byte16 bb = cast(byte16)b;
235         byte16 r;
236 
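        // Byte i of the result comes from position count + i in the 32-byte
        // concatenation (b = low 16 bytes, a = high 16 bytes); positions past 31 read as zero.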
237         for (int i = 0; i < 16; ++i)
238         {
239             const int srcpos = count + cast(int)i;
240             if (srcpos > 31) 
241             {
242                 r.ptr[i] = 0;
243             } 
244             else if (srcpos > 15) 
245             {
246                 r.ptr[i] = ab.array[(srcpos) & 15];
247             } 
248             else 
249             {
250                 r.ptr[i] = bb.array[srcpos];
251             }
252        }
253        return cast(__m128i)r;
254     }
255 }
256 unittest
257 {
258     __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
259     __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
260 
261     {
262         byte16 C = cast(byte16)_mm_alignr_epi8!0(A ,B);
263         byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
264         assert(C.array == correct);
265     }
266     {
267         byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B);
268         byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
269         assert(C.array == correct);
270     }
271     {
272         byte16 C = cast(byte16)_mm_alignr_epi8!34(A ,B);
273         byte[16] correct = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
274         assert(C.array == correct);
275     }
276 
277     __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
278     __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
279     byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
280     byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
281     assert(F.array == correct);
282 }
283 
284 /// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
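/// Note: `count == 0` returns `b`, and `count == 8` returns `a`.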
285 __m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
286 {
287     // PERF DMD
288     static if (GDC_with_SSSE3)
289     {
290         return cast(__m64)__builtin_ia32_palignr(cast(long1)a, cast(long1)b, count * 8);
291     }
292     else version(LDC)
293     {
294         static if (count >= 16)
295         {
296             return _mm_setzero_si64();
297         }
298         else static if (count < 8)
299         {
300             // Note: in LDC x86 this uses a pshufb.
301             // Generates ext in arm64.
302             return cast(__m64) shufflevector!(byte8, (0 + count),
303                                                      (1 + count),
304                                                      (2 + count),
305                                                      (3 + count),
306                                                      (4 + count),
307                                                      (5 + count),
308                                                      (6 + count),
309                                                      (7 + count))(cast(byte8)b, cast(byte8)a);
310         }
311         else
312         {
313             return cast(__m64) shufflevector!(byte8, (0 + count)%16,
314                                                      (1 + count)%16,
315                                                      (2 + count)%16,
316                                                      (3 + count)%16,
317                                                      (4 + count)%16,
318                                                      (5 + count)%16,
319                                                      (6 + count)%16,
320                                                      (7 + count)%16)(cast(byte8)_mm_setzero_si64(), cast(byte8)a);
321         }
322     }
323     else
324     {
325         byte8 ab = cast(byte8)a;
326         byte8 bb = cast(byte8)b;
327         byte8 r;
328 
329         for (int i = 0; i < 8; ++i)
330         {
331             const int srcpos = count + cast(int)i;
332             if (srcpos > 15) 
333             {
334                 r.ptr[i] = 0;
335             } 
336             else if (srcpos > 7) 
337             {
338                 r.ptr[i] = ab.array[(srcpos) & 7];
339             } 
340             else 
341             {
342                 r.ptr[i] = bb.array[srcpos];
343             }
344        }
345        return cast(__m64)r;
346     }
347 }
348 unittest
349 {
350     __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
351     __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);
352 
353     {
354         byte8 C = cast(byte8)_mm_alignr_pi8!0(A ,B);
355         byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
356         assert(C.array == correct);
357     }
358 
359     {
360         byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B);
361         byte[8] correct = [ 20, 21, 22, 23, 24, 1, 2, 3];
362         assert(C.array == correct);
363     }
364     {
365         byte8 C = cast(byte8)_mm_alignr_pi8!11(A ,B);
366         byte[8] correct = [4, 5, 6, 7, 8, 0, 0, 0];
367         assert(C.array == correct);
368     }
369     {
370         byte8 C = cast(byte8)_mm_alignr_pi8!17(A ,B);
371         byte[8] correct = [0, 0, 0, 0, 0, 0, 0, 0];
372         assert(C.array == correct);
373     }
374 }
375 
376 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
377 __m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
378 {
379     // PERF DMD
380     static if (GDC_with_SSSE3)
381     {
382         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
383     }
384     else static if (LDC_with_SSSE3)
385     {
386         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
387     }
388     else static if (LDC_with_ARM64)
389     {
390         return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
391     }
392     else
393     {
394         short8 sa = cast(short8)a;
395         short8 sb = cast(short8)b;
396         short8 r;
397         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
398         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
399         r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
400         r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
401         r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
402         r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
403         r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
404         r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
405         return cast(__m128i)r;
406     }
407 }
408 unittest
409 {
410     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
411     short8 C = cast(short8) _mm_hadd_epi16(A, A);
412     short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
413     assert(C.array == correct);
414 }
415 
416 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
417 __m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
418 { 
419     // PERF DMD
420     static if (GDC_with_SSSE3)
421     {
422         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
423     }
424     else static if (LDC_with_SSSE3)
425     {
426         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
427     }
428     else static if (LDC_with_ARM64)
429     {
430         return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
431     }
432     else
433     {
434         int4 ia = cast(int4)a;
435         int4 ib = cast(int4)b;
436         int4 r;
437         r.ptr[0] = ia.array[0] + ia.array[1];
438         r.ptr[1] = ia.array[2] + ia.array[3];
439         r.ptr[2] = ib.array[0] + ib.array[1];
440         r.ptr[3] = ib.array[2] + ib.array[3];
441         return cast(__m128i)r;
442     }
443 }
444 unittest
445 {
446     __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
447     __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
448     int4 C = cast(int4) _mm_hadd_epi32(A, B);
449     int[4] correct = [ -1, int.max, int.min, 0 ];
450     assert(C.array == correct);
451 }
452 
453 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
454 __m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
455 {
456     // PERF DMD
457     static if (GDC_with_SSSE3)
458     {
459         return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
460     }
461     else static if (LDC_with_ARM64)
462     {
463         return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
464     }
465     else
466     {
467         // LDC x86: generates phaddw since LDC 1.24 -O2.
468         short4 r;
469         short4 sa = cast(short4)a;
470         short4 sb = cast(short4)b;
471         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); 
472         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
473         r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
474         r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
475         return cast(__m64)r;
476     }
477 }
478 unittest
479 {
480     __m64 A = _mm_setr_pi16(1, -2, 4, 8);
481     __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
482     short4 C = cast(short4) _mm_hadd_pi16(A, B);
483     short[4] correct = [ -1, 12, 48, 32767 ];
484     assert(C.array == correct);
485 }
486 
487 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, 
488 /// and pack the signed 32-bit results.
489 __m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
490 {
491     // PERF DMD
492     static if (GDC_with_SSSE3)
493     {
494         return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
495     }
496     else static if (LDC_with_ARM64)
497     {
498         return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
499     }
500     else
501     {
502         // LDC x86: generates phaddd since LDC 1.24 -O2
503         int2 ia = cast(int2)a;
504         int2 ib = cast(int2)b;
505         int2 r;
506         r.ptr[0] = ia.array[0] + ia.array[1];
507         r.ptr[1] = ib.array[0] + ib.array[1];
508         return cast(__m64)r;
509     }
510 }
511 unittest
512 {
513     __m64 A = _mm_setr_pi32(int.min, -1);
514     __m64 B = _mm_setr_pi32(1, int.max);
515     int2 C = cast(int2) _mm_hadd_pi32(A, B);
516     int[2] correct = [ int.max, int.min ];
517     assert(C.array == correct);
518 }
519 
520 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
521 /// and pack the signed 16-bit results.
522 __m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
523 {
524      // PERF DMD
525     static if (GDC_with_SSSE3)
526     {
527         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
528     }
529     else static if (LDC_with_SSSE3)
530     {
531         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
532     }
533     else static if (LDC_with_ARM64)
534     {
535         // uzp1/uzp2/sqadd sequence
536         short8 sa = cast(short8)a;
537         short8 sb = cast(short8)b;
538         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
539         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
540         return cast(__m128i)vqaddq_s16(c, d);
541     }
542     else
543     {
544         short8 sa = cast(short8)a;
545         short8 sb = cast(short8)b;
546         short8 r;
547         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
548         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
549         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
550         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
551         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
552         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
553         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
554         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
555         return cast(__m128i)r;
556     }
557 }
558 unittest
559 {
560     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
561     short8 C = cast(short8) _mm_hadds_epi16(A, A);
562     short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
563     assert(C.array == correct);
564 }
565 
566 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
567 /// and pack the signed 16-bit results.
568 __m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
569 {
570     static if (GDC_with_SSSE3)
571     {
572         return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
573     }
574     else static if (LDC_with_SSSE3)
575     {
576         // Note: LDC doesn't have __builtin_ia32_phaddsw
577         long2 la;
578         la.ptr[0] = a.array[0];
579         long2 lb;
580         lb.ptr[0] = b.array[0];
581         int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
582         int2 r;
583         r.ptr[0] = sum.array[0];
584         r.ptr[1] = sum.array[2];
585         return cast(__m64)r;
586     }
587     else static if (LDC_with_ARM64)
588     {
589         // uzp1/uzp2/sqadd sequence
590         short4 sa = cast(short4)a;
591         short4 sb = cast(short4)b;
592         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
593         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
594         return cast(__m64)vqadd_s16(c, d);
595     }
596     else
597     {
598         short4 sa = cast(short4)a;
599         short4 sb = cast(short4)b;
600         short4 r;
601         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
602         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
603         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
604         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
605         return cast(__m64)r;
606     }
607 }
608 unittest
609 {
610     __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
611     __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
612     short4 C = cast(short4) _mm_hadds_pi16(A, B);
613     short[4] correct = [ 16, -32768,  96,  32767];
614     assert(C.array == correct);
615 }
616 
617 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
619 __m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
620 {
621     // PERF DMD
622     static if (GDC_with_SSSE3)
623     {
624         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
625     }
626     else static if (LDC_with_SSSE3)
627     {
628         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
629     }
630     else static if (LDC_with_ARM64)
631     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1
633         short8 sa = cast(short8)a;
634         short8 sb = cast(short8)b;
635         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
636         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
637         return cast(__m128i)(c - d);
638     }
639     else 
640     {
641         short8 sa = cast(short8)a;
642         short8 sb = cast(short8)b;
643         short8 r;
644         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
645         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
646         r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
647         r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
648         r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
649         r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
650         r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
651         r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
652         return cast(__m128i)r;
653     }
654 }
655 unittest
656 {
657     __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
658     short8 C = cast(short8) _mm_hsub_epi16(A, A);
659     short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
660     assert(C.array == correct);
661 }
662 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
664 __m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
665 { 
666     // PERF DMD
667     static if (GDC_with_SSSE3)
668     {
669         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
670     }
671     else static if (LDC_with_SSSE3)
672     {
673         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
674     }
675     else static if (LDC_with_ARM64)
676     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1
678         int4 ia = cast(int4)a;
679         int4 ib = cast(int4)b;
680         int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
681         int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
682         return cast(__m128i)(c - d);
683     }
684     else
685     {
686         int4 ia = cast(int4)a;
687         int4 ib = cast(int4)b;
688         int4 r;
689         r.ptr[0] = ia.array[0] - ia.array[1];
690         r.ptr[1] = ia.array[2] - ia.array[3];
691         r.ptr[2] = ib.array[0] - ib.array[1];
692         r.ptr[3] = ib.array[2] - ib.array[3];
693         return cast(__m128i)r;
694     }
695 }
696 unittest
697 {
698     __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
699     __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
700     int4 C = cast(int4) _mm_hsub_epi32(A, B);
701     int[4] correct = [ -1, int.max, int.min, 0 ];
702     assert(C.array == correct);
703 }
704 
705 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, 
706 /// and pack the signed 16-bit results.
707 __m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
708 {
709     // PERF DMD
710     static if (GDC_with_SSSE3)
711     {
712         return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
713     }
714     else static if (LDC_with_ARM64)
715     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.3 -O1
717         short4 sa = cast(short4)a;
718         short4 sb = cast(short4)b;
719         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
720         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
721         return cast(__m64)(c - d);
722     }
723     else
724     {
725         // LDC x86: generates phsubw since LDC 1.24 -O2
726         short4 sa = cast(short4)a;
727         short4 sb = cast(short4)b;
728         short4 r;
729         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
730         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
731         r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
732         r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
733         return cast(__m64)r;
734     }
735 }
736 unittest
737 {
738     __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
739     __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
740     short4 C = cast(short4) _mm_hsub_pi16(A, B);
741     short[4] correct = [ short.max, -4, -16, -32767];
742     assert(C.array == correct);
743 }
744 
745 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, 
746 /// and pack the signed 32-bit results.
747 __m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
748 {
749     // PERF DMD
750     static if (GDC_with_SSSE3)
751     {
752         return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
753     }
754     else static if (LDC_with_ARM64)
755     {
756         // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
757         int2 ia = cast(int2)a;
758         int2 ib = cast(int2)b;
759         int2 c = shufflevector!(int2, 0, 2)(ia, ib);
760         int2 d = shufflevector!(int2, 1, 3)(ia, ib);
761         return cast(__m64)(c - d);
762     }
763     else
764     {
765         // LDC x86: generates phsubd since LDC 1.24 -O2
766         int2 ia = cast(int2)a;
767         int2 ib = cast(int2)b;
768         int2 r;
769         r.ptr[0] = ia.array[0] - ia.array[1];
770         r.ptr[1] = ib.array[0] - ib.array[1];
771         return cast(__m64)r;
772     }
773 }
774 unittest
775 {
776     __m64 A = _mm_setr_pi32(int.min, 1);
777     __m64 B = _mm_setr_pi32(int.max, -1);
778     int2 C = cast(int2) _mm_hsub_pi32(A, B);
779     int[2] correct = [ int.max, int.min ];
780     assert(C.array == correct);
781 }
782 
783 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
784 /// and pack the signed 16-bit results.
785 __m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
786 {
787      // PERF DMD
788     static if (GDC_with_SSSE3)
789     {
790         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
791     }
792     else static if (LDC_with_SSSE3)
793     {
794         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
795     }
796     else static if (LDC_with_ARM64)
797     {
798         // uzp1/uzp2/sqsub sequence
799         short8 sa = cast(short8)a;
800         short8 sb = cast(short8)b;
801         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
802         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
803         return cast(__m128i)vqsubq_s16(c, d);
804     }
805     else
806     {
807         short8 sa = cast(short8)a;
808         short8 sb = cast(short8)b;
809         short8 r;
810         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
811         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
812         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
813         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
814         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
815         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
816         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
817         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
818         return cast(__m128i)r;
819     }
820 }
821 unittest
822 {
823     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
824     short8 C = cast(short8) _mm_hsubs_epi16(A, A);
825     short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
826     assert(C.array == correct);
827 }
828 
829 
830 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
831 /// and pack the signed 16-bit results.
832 __m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
833 {
834     static if (GDC_with_SSSE3)
835     {
836         return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
837     }
838     else static if (LDC_with_SSSE3)
839     {
840         // Note: LDC doesn't have __builtin_ia32_phsubsw
841         long2 la;
842         la.ptr[0] = a.array[0];
843         long2 lb;
844         lb.ptr[0] = b.array[0];
845         int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
846         int2 r;
847         r.ptr[0] = sum.array[0];
848         r.ptr[1] = sum.array[2];
849         return cast(__m64)r;
850     }
851     else static if (LDC_with_ARM64)
852     {
853         // uzp1/uzp2/sqsub sequence in -O1
854         short4 sa = cast(short4)a;
855         short4 sb = cast(short4)b;
856         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
857         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
858         return cast(__m64)vqsub_s16(c, d);
859     }
860     else
861     {
862         short4 sa = cast(short4)a;
863         short4 sb = cast(short4)b;
864         short4 r;
865         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
866         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
867         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
868         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
869         return cast(__m64)r;
870     }
871 }
872 unittest
873 {
874     __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
875     __m64 B = _mm_setr_pi16( 64, 30,   -9,  32767);
876     short4 C = cast(short4) _mm_hsubs_pi16(A, B);
877     short[4] correct = [ -48, 32767,  34,  -32768];
878     assert(C.array == correct);
879 }
880 
881 
882 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
883 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
884 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
885 /// and pack the saturated results.
886 __m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
887 {
888     static if (GDC_with_SSSE3)
889     {
890         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
891     }
892     else static if (LDC_with_SSSE3)
893     {
894         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
895     }
896     else
897     {
898         // zero-extend a to 16-bit
899         __m128i zero = _mm_setzero_si128();
900         __m128i a_lo = _mm_unpacklo_epi8(a, zero);
901         __m128i a_hi = _mm_unpackhi_epi8(a, zero);
902 
903         // sign-extend b to 16-bit
904         __m128i b_lo = _mm_unpacklo_epi8(b, zero);
905         __m128i b_hi = _mm_unpackhi_epi8(b, zero);    
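        // (x << 8) >> 8 with an arithmetic right shift sign-extends the low byte of each 16-bit lane.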
906         b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
907         b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8); 
908 
909         // Multiply element-wise, no overflow can occur
910         __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);  
911         __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);
912 
913         // Add pairwise with saturating horizontal add
914         return _mm_hadds_epi16(c_lo, c_hi);
915     }
916 }
917 unittest
918 {
919     __m128i A = _mm_setr_epi8(  -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
920     __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
921     short8 C = cast(short8) _mm_maddubs_epi16(A, B);
922     short[8] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0];
923     assert(C.array == correct);
924 }
925 
926 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
927 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
928 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
929 /// and pack the saturated results.
930 __m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
931 {
932     static if (GDC_with_SSSE3)
933     {
934         return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
935     }
936     else static if (LDC_with_SSSE3)
937     {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
941     }
942     else
943     {
944         // zero-extend a to 16-bit
945         __m128i zero = _mm_setzero_si128();
946         __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);
947 
948         // sign-extend b to 16-bit
949         __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);    
950         B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);
951 
952         // Multiply element-wise, no overflow can occur
953         __m128i c = _mm_mullo_epi16(A, B);
954 
955         // Add pairwise with saturating horizontal add
956         return to_m64( _mm_hadds_epi16(c, zero));
957     }
958 }
959 unittest
960 {
961     __m64 A = _mm_setr_pi8(  -1,  10, 100, -128, 0, 0, 0, 0); // u8
962     __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
963     short4 C = cast(short4) _mm_maddubs_pi16(A, B);
964     short[4] correct =       [   -32768,   26256, 0, 0];
965     assert(C.array == correct);
966 }
967 
968 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
969 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
970 __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
971 {
972     // PERF DMD
973     static if (GDC_with_SSSE3)
974     {
975         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
976     }
977     else static if (LDC_with_SSSE3)
978     {
979         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
980     }
981     else static if (LDC_with_ARM64)
982     {
983         int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
984                                 vget_low_s16(cast(short8)b));
985         int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
986                                 vget_high_s16(cast(short8)b));
987 
988         // Rounding narrowing shift right
989         // narrow = (int16_t)((mul + 16384) >> 15);
990         short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
991         short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);
992 
993         // Join together.
994         return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
995     }
996     else
997     {
998         short8 sa = cast(short8)a;
999         short8 sb = cast(short8)b;
1000         short8 r;
1001 
1002         for (int i = 0; i < 8; ++i)
1003         {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to the Intel pseudocode.
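            // ((x + 0x4000) >> 15) equals bits [16:1] of ((x >> 14) + 1), which is what the pseudocode keeps.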
1005             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
1006         }
1007 
1008         return cast(__m128i)r;
1009     }
1010 }
1011 
1012 unittest
1013 {
1014     __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
1015     __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
1016     short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
1017     short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
1018     assert(C.array == correct);
1019 }
1020 
1021 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
1022 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
1023 __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
1024 {
1025     // PERF DMD
1026     static if (GDC_with_SSSE3)
1027     {
1028         return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
1029     }
1030     else static if (LDC_with_SSSE3)
1031     {
1032         return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
1033     }
1034     else static if (LDC_with_ARM64)
1035     {
1036         int4 mul = vmull_s16(cast(short4)a, cast(short4)b);
1037 
1038         // Rounding narrowing shift right
1039         // (int16_t)((mul + 16384) >> 15);
1040         return cast(__m64) vrshrn_n_s32(mul, 15);
1041     }
1042     else
1043     {
1044         short4 sa = cast(short4)a;
1045         short4 sb = cast(short4)b;
1046         short4 r;
1047 
1048         for (int i = 0; i < 4; ++i)
1049         {
1050             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
1051         }
1052         return cast(__m64)r;
1053     }
1054 }
1055 unittest
1056 {
1057     __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
1058     __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
1059     short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
1060     short[4] correct = [3344, 24487, 15678, 0];
1061     assert(C.array == correct);
1062 }
1063 
1064 
1065 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1066 __m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
1067 {
1068     // This is the lovely pshufb.
1069     // PERF DMD
1070     static if (GDC_with_SSSE3)
1071     {
1072         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1073     }
1074     else static if (LDC_with_SSSE3)
1075     {
1076         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1077     }
1078     else static if (LDC_with_ARM64)
1079     {
1080         byte16 bb = cast(byte16)b;
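        // pshufb: bit 7 of a control byte selects zero, bits 0-3 select the source byte.
        // Masking with 0x8F clears bits 4-6, so bytes with bit 7 set become indices >= 16,
        // which vqtbl1q_s8 maps to zero, matching pshufb's zeroing.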
1081         byte16 mask;
1082         mask = cast(byte)(0x8F);
1083         bb = bb & mask;
1084         byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
1085         return cast(__m128i)r;
1086     }
1087     else
1088     {
1089         byte16 r;
1090         byte16 ba = cast(byte16)a;
1091         byte16 bb = cast(byte16)b;
1092         for (int i = 0; i < 16; ++i)
1093         {
1094             byte s = bb.array[i];
1095             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
1096         }
1097         return cast(__m128i)r;
1098     }
1099 }
1100 unittest
1101 {
1102     __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
1103     __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
1104     byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
1105     byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
1106     assert(C.array == correct);
1107 }
1108 
1109 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1110 __m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
1111 {
1112     // PERF DMD
1113     static if (GDC_with_SSSE3)
1114     {
        alias ubyte8 = __vector(ubyte[8]);
1116         return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
1117     }
1118     else static if (LDC_with_SSSE3)
1119     {
        // GDC does the proper dance to avoid MMX registers; do it manually in LDC, since __builtin_ia32_pshufb doesn't exist there.
1121         __m128i A = to_m128i(a);
1122         __m128i index = to_m128i(b);
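        // Clear bit 3 of each control byte so lookups stay within the low 8 bytes
        // (where the __m64 data sits); bit 7 is kept so pshufb still zeroes those lanes.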
1123         index = index & _mm_set1_epi32(0xF7F7F7F7);
1124         return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
1125     }
1126     else static if (LDC_with_ARM64)
1127     {
1128         byte8 bb = cast(byte8)b;
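        // Keep bit 7 (zeroing) and the low 3 index bits; out-of-range indices make vtbl1_s8 return zero, like pshufb.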
1129         byte8 mask;
1130         mask = cast(byte)(0x87);
1131         bb = bb & mask;
1132         __m128i l = to_m128i(a);
1133         byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
1134         return cast(__m64)r;
1135     }
1136     else
1137     {
1138         byte8 r;
1139         byte8 ba = cast(byte8)a;
1140         byte8 bb = cast(byte8)b;
1141         for (int i = 0; i < 8; ++i)
1142         {
1143             byte s = bb.array[i];
1144             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
1145         }
1146         return cast(__m64)r;
1147     }
1148 }
1149 unittest
1150 {
1151     __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
1152     __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
1153     byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
1154     byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
1155     assert(C.array == correct);
1156 }
1157 
1158 /// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
1159 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1160 __m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
1161 {
1162     // PERF DMD
1163     static if (GDC_with_SSSE3)
1164     {
1165         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
1166     }
1167     else static if (LDC_with_SSSE3)
1168     {
1169         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);       
1170     }
1171     else
1172     {
1173         // LDC arm64: 5 instructions
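        // mask is -1 in lanes where b < 0, so (a + mask) ^ mask negates those lanes;
        // zeromask then clears lanes where b == 0.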
1174         __m128i mask = _mm_srai_epi16(b, 15);
1175         __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
1176         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
1177     }
1178 }
1179 unittest
1180 {
1181     __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
1182     __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
1183     short8 C = cast(short8) _mm_sign_epi16(A, B);
1184     short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
1185     assert(C.array == correct);
1186 }
1187 
1188 /// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative. 
1189 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1190 __m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
1191 {
1192     // PERF DMD
1193     static if (GDC_with_SSSE3)
1194     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1196     }
1197     else static if (LDC_with_SSSE3)
1198     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1200     }
1201     else
1202     {
1203         __m128i mask = _mm_srai_epi32(b, 31);
1204         __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
1205         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
1206     }
1207 }
1208 unittest
1209 {
1210     __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
1211     __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
1212     int4 C = cast(int4) _mm_sign_epi32(A, B);
1213     int[4] correct =          [ 2,  0, 0, int.max];
1214     assert(C.array == correct);
1215 }
1216 
1217 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1218 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1219 __m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
1220 {
1221     // PERF DMD
1222     static if (GDC_with_SSSE3)
1223     {
1224         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1225     }
1226     else static if (LDC_with_SSSE3)
1227     {
1228         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1229     }
1230     else
1231     {
1232         __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
1233         __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
1234         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
1235     }
1236 }
1237 unittest
1238 {
1239     __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
1240     __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1241     byte16  C = cast(byte16) _mm_sign_epi8(A, B);
1242     byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
1243     assert(C.array == correct);
1244 }
1245 
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1248 __m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
1249 {
1250     return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
1251 }
1252 unittest
1253 {
1254     __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
1255     __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
1256     short4 C = cast(short4) _mm_sign_pi16(A, B);
1257     short[4] correct =     [-2, short.min,         0, short.min];
1258     assert(C.array == correct);
1259 }
1260 
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1263 __m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
1264 {
1265     return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
1266 }
1267 unittest
1268 {
1269     __m64 A = _mm_setr_pi32(-2, -100);
1270     __m64 B = _mm_setr_pi32(-1,  0);
1271     int2 C = cast(int2) _mm_sign_pi32(A, B);
1272     int[2] correct =          [ 2,  0];
1273     assert(C.array == correct);
1274 }
1275 
1276 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1277 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1278 __m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
1279 {
1280     return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
1281 }
1282 unittest
1283 {
1284     __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1285     __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
1286     byte8  C = cast(byte8) _mm_sign_pi8(A, B);
1287     byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
1288     assert(C.array == correct);
1289 }