1 /**
2 * SSSE3 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.tmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 
13 public import inteli.pmmintrin;
14 import inteli.mmx;
15 
16 nothrow @nogc:
17 
18 
19 // SSSE3 instructions
20 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
21 // Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actively
// generate SSSE3 instructions.
24 
25 /// Compute the absolute value of packed signed 16-bit integers in `a`.
26 __m128i _mm_abs_epi16 (__m128i a) @trusted
27 {
28     static if (DMD_with_DSIMD)
29     {
30         return cast(__m128i)__simd(XMM.PABSW, a);
31     }
32     else static if (GDC_with_SSSE3)
33     {
34         return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
35     }
36     else static if (LDC_with_ARM64)
37     {
38         return cast(__m128i) vabsq_s16(cast(short8)a);
39     }
40     else
41     {
        // LDC x86: generates pabsw since LDC 1.1 -O2
43         short8 sa = cast(short8)a;
44         for (int i = 0; i < 8; ++i)
45         {
46             short s = sa.array[i];
47             sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
48         }  
49         return cast(__m128i)sa;
50     }
51 }
52 unittest
53 {
54     __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
55     short8 B = cast(short8) _mm_abs_epi16(A);
56     short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
57     assert(B.array == correct);
58 }
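
// _mm_abs_epi16 combines naturally with a subtraction to compute absolute differences,
// as long as the difference itself cannot overflow 16 bits; an illustrative sketch:
unittest
{
    __m128i a = _mm_setr_epi16(10, 200, 3, -50, 0, 0, 0, 0);
    __m128i b = _mm_setr_epi16(30, 100, 3,  50, 0, 0, 0, 0);
    short8 d = cast(short8) _mm_abs_epi16(_mm_sub_epi16(a, b));
    short[8] correct = [20, 100, 0, 100, 0, 0, 0, 0];
    assert(d.array == correct);
}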
59 
60 /// Compute the absolute value of packed signed 32-bit integers in `a`.
61 __m128i _mm_abs_epi32 (__m128i a) @trusted
62 {
63     static if (DMD_with_DSIMD)
64     {
65         return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
66     }
67     else static if (GDC_with_SSSE3)
68     {
69         return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
70     }
71     else static if (LDC_with_ARM64)
72     {
73         return cast(__m128i) vabsq_s32(cast(int4)a);
74     }
75     else
76     {
77         // LDC x86: generates pabsd since LDC 1.1 -O2
78         int4 sa = cast(int4)a;
79         for (int i = 0; i < 4; ++i)
80         {
81             int s = sa.array[i];
82             sa.ptr[i] = s >= 0 ? s : -s;
83         }  
84         return cast(__m128i)sa;
85     } 
86 }
87 unittest
88 {
89     __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
90     int4 B = cast(int4) _mm_abs_epi32(A);
91     int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
92     assert(B.array == correct);
93 }
94 
95 /// Compute the absolute value of packed signed 8-bit integers in `a`.
96 __m128i _mm_abs_epi8 (__m128i a) @trusted
97 {
98     static if (DMD_with_DSIMD)
99     {
100         return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
101     }
102     else static if (GDC_with_SSSE3)
103     {
104         alias ubyte16 = __vector(ubyte[16]);
105         return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
106     }
107     else static if (LDC_with_ARM64)
108     {
109         return cast(__m128i) vabsq_s8(cast(byte16)a);
110     }
111     else static if (LDC_with_SSSE3)
112     {
113         return __asm!__m128i("pabsb $1,$0","=x,x",a);
114     }
115     else
116     {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This min/sub trick doesn't generate
        // pabsb in LDC though.
119         return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
120     }
121 }
122 unittest
123 {
124     __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
125     byte16 B = cast(byte16) _mm_abs_epi8(A);
126     byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
127     assert(B.array == correct);
128 }
129 
130 /// Compute the absolute value of packed signed 16-bit integers in `a`.
131 __m64 _mm_abs_pi16 (__m64 a) @trusted
132 {
133     return to_m64(_mm_abs_epi16(to_m128i(a)));
134 }
135 unittest
136 {
137     __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
138     short4 B = cast(short4) _mm_abs_pi16(A);
139     short[4] correct = [0, 1, -32768, 32767];
140     assert(B.array == correct);
141 }
142 
143 /// Compute the absolute value of packed signed 32-bit integers in `a`.
144 __m64 _mm_abs_pi32 (__m64 a) @trusted
145 {
146      return to_m64(_mm_abs_epi32(to_m128i(a)));
147 }
148 unittest
149 {
150     __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
151     int2 B = cast(int2) _mm_abs_pi32(A);
152     int[2] correct = [1, -2_147_483_648];
153     assert(B.array == correct);
154 }
155 
156 /// Compute the absolute value of packed signed 8-bit integers in `a`.
157 __m64 _mm_abs_pi8 (__m64 a) @trusted
158 {
159     return to_m64(_mm_abs_epi8(to_m128i(a)));
160 }
161 unittest
162 {
163     __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
164     byte8 B = cast(byte8) _mm_abs_pi8(A);
165     byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
166     assert(B.array == correct);
167 }
168 
169 /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
170 __m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
171 {
172     static assert(count < 32);
173 
174     // PERF DMD
175     static if (GDC_with_SSSE3)
176     {
177         return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
178     }
179     else version(LDC)
180     {
181         static if (count < 16)
182         {
183             // Generates palignr since LDC 1.1 -O1
184             // Also generates a single ext instruction on arm64.
185             return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
186                                                         ( 1 + count) % 32,
187                                                         ( 2 + count) % 32,
188                                                         ( 3 + count) % 32,
189                                                         ( 4 + count) % 32,
190                                                         ( 5 + count) % 32,
191                                                         ( 6 + count) % 32,
192                                                         ( 7 + count) % 32,
193                                                         ( 8 + count) % 32,
194                                                         ( 9 + count) % 32,
195                                                         (10 + count) % 32,
196                                                         (11 + count) % 32,
197                                                         (12 + count) % 32,
198                                                         (13 + count) % 32,
199                                                         (14 + count) % 32,
200                                                         (15 + count) % 32)(cast(byte16)b, cast(byte16)a);
201         }
202         else
203         {
204             return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
205                                                         ( 1 + count) % 32,
206                                                         ( 2 + count) % 32,
207                                                         ( 3 + count) % 32,
208                                                         ( 4 + count) % 32,
209                                                         ( 5 + count) % 32,
210                                                         ( 6 + count) % 32,
211                                                         ( 7 + count) % 32,
212                                                         ( 8 + count) % 32,
213                                                         ( 9 + count) % 32,
214                                                         (10 + count) % 32,
215                                                         (11 + count) % 32,
216                                                         (12 + count) % 32,
217                                                         (13 + count) % 32,
218                                                         (14 + count) % 32,
219                                                         (15 + count) % 32)(cast(byte16)_mm_setzero_si128(), cast(byte16)a);
220         }
221     }
222     else
223     {
224         byte16 ab = cast(byte16)a;
225         byte16 bb = cast(byte16)b;
226         byte16 r;
227 
228         for (int i = 0; i < 16; ++i)
229         {
230             const int srcpos = count + cast(int)i;
231             if (srcpos > 31) 
232             {
233                 r.ptr[i] = 0;
234             } 
235             else if (srcpos > 15) 
236             {
237                 r.ptr[i] = ab[(srcpos) & 15];
238             } 
239             else 
240             {
241                 r.ptr[i] = bb[srcpos];
242             }
243        }
244        return cast(__m128i)r;
245     }
246 }
247 unittest
248 {
249     __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
250     __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
251 
252     {
        byte16 C = cast(byte16)_mm_alignr_epi8!0(A, B);
254         byte[16] correct = [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32];
255         assert(C.array == correct);
256     }
257     {
        byte16 C = cast(byte16)_mm_alignr_epi8!20(A, B);
260         byte[16] correct = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0];
261         assert(C.array == correct);
262     }
263 
264     __m128i D = _mm_setr_epi8(-123, -82, 103, -69, 103, -26, 9, 106, 58, -11, 79, -91, 114, -13, 110, 60);
265     __m128i E = _mm_setr_epi8(25, -51, -32, 91, -85, -39, -125, 31, -116, 104, 5, -101, 127, 82, 14, 81);
266     byte16 F = cast(byte16)_mm_alignr_epi8!8(D, E);
267     byte[16] correct = [-116, 104, 5, -101, 127, 82, 14, 81, -123, -82, 103, -69, 103, -26, 9, 106];
268     assert(F.array == correct);
269 }
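
// _mm_alignr_epi8 is commonly used to build a sliding window over two consecutive
// 16-byte loads; an illustrative sketch extracting bytes 4..19 of a 32-byte stream:
unittest
{
    __m128i lo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i hi = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    byte16 window = cast(byte16) _mm_alignr_epi8!4(hi, lo);
    byte[16] correct = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19];
    assert(window.array == correct);
}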
270 
271 /// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
272 __m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
273 {
274     // PERF DMD
275     static if (GDC_with_SSSE3)
276     {
277         return cast(__m64)__builtin_ia32_palignr(cast(long)a, cast(long)b, count * 8);
278     }
    else
    {
        byte8 ab = cast(byte8)a;
        byte8 bb = cast(byte8)b;
        byte8 r;

        for (int i = 0; i < 8; ++i)
        {
            const int srcpos = count + cast(int)i;
            if (srcpos > 15)
            {
                r.ptr[i] = 0;
            }
            else if (srcpos > 7)
            {
                r.ptr[i] = ab[srcpos & 7];
            }
            else
            {
                r.ptr[i] = bb[srcpos];
            }
        }
        return cast(__m64)r;
    }
292 }
293 unittest
294 {
295     __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
296     __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);
297 
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!0(A, B);
        byte[8] correct = [17, 18, 19, 20, 21, 22, 23, 24];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!3(A, B);
        byte[8] correct = [20, 21, 22, 23, 24, 1, 2, 3];
        assert(C.array == correct);
    }
    {
        byte8 C = cast(byte8)_mm_alignr_pi8!10(A, B);
        byte[8] correct = [3, 4, 5, 6, 7, 8, 0, 0];
        assert(C.array == correct);
    }
308 }
309 
310 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
311 __m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
312 {
313     // PERF DMD
314     static if (GDC_with_SSSE3)
315     {
316         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
317     }
318     else static if (LDC_with_SSSE3)
319     {
320         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
321     }
322     else static if (LDC_with_ARM64)
323     {
324         return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
325     }
326     else
327     {
328         short8 sa = cast(short8)a;
329         short8 sb = cast(short8)b;
330         short8 r;
331         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
332         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
333         r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
334         r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
335         r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
336         r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
337         r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
338         r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
339         return cast(__m128i)r;
340     }
341 }
342 unittest
343 {
344     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
345     short8 C = cast(short8) _mm_hadd_epi16(A, A);
346     short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
347     assert(C.array == correct);
348 }
349 
350 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
351 __m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
352 { 
353     // PERF DMD
354     static if (GDC_with_SSSE3)
355     {
356         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
357     }
358     else static if (LDC_with_SSSE3)
359     {
360         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
361     }
362     else static if (LDC_with_ARM64)
363     {
364         return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
365     }
366     else
367     {
368         int4 ia = cast(int4)a;
369         int4 ib = cast(int4)b;
370         int4 r;
371         r.ptr[0] = ia.array[0] + ia.array[1];
372         r.ptr[1] = ia.array[2] + ia.array[3];
373         r.ptr[2] = ib.array[0] + ib.array[1];
374         r.ptr[3] = ib.array[2] + ib.array[3];
375         return cast(__m128i)r;
376     }
377 }
378 unittest
379 {
380     __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
381     __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
382     int4 C = cast(int4) _mm_hadd_epi32(A, B);
383     int[4] correct = [ -1, int.max, int.min, 0 ];
384     assert(C.array == correct);
385 }
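
// A typical use of _mm_hadd_epi32 is a full horizontal reduction; an illustrative
// sketch summing four ints with two horizontal adds:
unittest
{
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);
    __m128i s = _mm_hadd_epi32(v, v); // [3, 7, 3, 7]
    s = _mm_hadd_epi32(s, s);         // [10, 10, 10, 10]
    assert((cast(int4)s).array[0] == 10);
}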
386 
387 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
388 __m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
389 {
390     // PERF DMD
391     static if (GDC_with_SSSE3)
392     {
393         return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
394     }
395     else static if (LDC_with_ARM64)
396     {
397         return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
398     }
399     else
400     {
401         // LDC x86: generates phaddw since LDC 1.24 -O2.
402         short4 r;
403         short4 sa = cast(short4)a;
404         short4 sb = cast(short4)b;
405         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); 
406         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
407         r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
408         r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
409         return cast(__m64)r;
410     }
411 }
412 unittest
413 {
414     __m64 A = _mm_setr_pi16(1, -2, 4, 8);
415     __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
416     short4 C = cast(short4) _mm_hadd_pi16(A, B);
417     short[4] correct = [ -1, 12, 48, 32767 ];
418     assert(C.array == correct);
419 }
420 
421 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, 
422 /// and pack the signed 32-bit results.
423 __m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
424 {
425     // PERF DMD
426     static if (GDC_with_SSSE3)
427     {
428         return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
429     }
430     else static if (LDC_with_ARM64)
431     {
432         return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
433     }
434     else
435     {
436         // LDC x86: generates phaddd since LDC 1.24 -O2
437         int2 ia = cast(int2)a;
438         int2 ib = cast(int2)b;
439         int2 r;
440         r.ptr[0] = ia.array[0] + ia.array[1];
441         r.ptr[1] = ib.array[0] + ib.array[1];
442         return cast(__m64)r;
443     }
444 }
445 unittest
446 {
447     __m64 A = _mm_setr_pi32(int.min, -1);
448     __m64 B = _mm_setr_pi32(1, int.max);
449     int2 C = cast(int2) _mm_hadd_pi32(A, B);
450     int[2] correct = [ int.max, int.min ];
451     assert(C.array == correct);
452 }
453 
454 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
455 /// and pack the signed 16-bit results.
456 __m128i _mm_hadds_epi16 (__m128i a, __m128i b) @trusted
457 {
458      // PERF DMD
459     static if (GDC_with_SSSE3)
460     {
461         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
462     }
463     else static if (LDC_with_SSSE3)
464     {
465         return cast(__m128i)__builtin_ia32_phaddsw128(cast(short8)a, cast(short8)b);
466     }
467     else static if (LDC_with_ARM64)
468     {
469         // uzp1/uzp2/sqadd sequence
470         short8 sa = cast(short8)a;
471         short8 sb = cast(short8)b;
472         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
473         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
474         return cast(__m128i)vqaddq_s16(c, d);
475     }
476     else
477     {
478         short8 sa = cast(short8)a;
479         short8 sb = cast(short8)b;
480         short8 r;
481         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
482         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
483         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] + sa.array[5]);
484         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] + sa.array[7]);
485         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
486         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
487         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] + sb.array[5]);
488         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] + sb.array[7]);
489         return cast(__m128i)r;
490     }
491 }
492 unittest
493 {
494     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
495     short8 C = cast(short8) _mm_hadds_epi16(A, A);
496     short[8] correct = [ -1, 12, 48, -32768, -1, 12, 48, -32768];
497     assert(C.array == correct);
498 }
499 
500 /// Horizontally add adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
501 /// and pack the signed 16-bit results.
502 __m64 _mm_hadds_pi16 (__m64 a, __m64 b) @trusted
503 {
504     static if (GDC_with_SSSE3)
505     {
506         return cast(__m64)__builtin_ia32_phaddsw(cast(short4)a, cast(short4)b);
507     }
508     else static if (LDC_with_SSSE3)
509     {
510         // Note: LDC doesn't have __builtin_ia32_phaddsw
511         long2 la;
512         la.ptr[0] = a.array[0];
513         long2 lb;
514         lb.ptr[0] = b.array[0];
515         int4 sum = cast(int4)__builtin_ia32_phaddsw128(cast(short8)la, cast(short8)lb);
516         int2 r;
517         r.ptr[0] = sum.array[0];
518         r.ptr[1] = sum.array[2];
519         return cast(__m64)r;
520     }
521     else static if (LDC_with_ARM64)
522     {
523         // uzp1/uzp2/sqadd sequence
524         short4 sa = cast(short4)a;
525         short4 sb = cast(short4)b;
526         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
527         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
528         return cast(__m64)vqadd_s16(c, d);
529     }
530     else
531     {
532         short4 sa = cast(short4)a;
533         short4 sb = cast(short4)b;
534         short4 r;
535         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] + sa.array[1]);
536         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] + sa.array[3]);
537         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] + sb.array[1]);
538         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] + sb.array[3]);
539         return cast(__m64)r;
540     }
541 }
542 unittest
543 {
544     __m64 A = _mm_setr_pi16(-16, 32, -100, -32768);
545     __m64 B = _mm_setr_pi16( 64, 32,    1,  32767);
546     short4 C = cast(short4) _mm_hadds_pi16(A, B);
547     short[4] correct = [ 16, -32768,  96,  32767];
548     assert(C.array == correct);
549 }
550 
551 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
553 __m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
554 {
555     // PERF DMD
556     static if (GDC_with_SSSE3)
557     {
558         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
559     }
560     else static if (LDC_with_SSSE3)
561     {
562         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
563     }
564     else static if (LDC_with_ARM64)
565     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1.
567         short8 sa = cast(short8)a;
568         short8 sb = cast(short8)b;
569         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
570         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
571         return cast(__m128i)(c - d);
572     }
573     else 
574     {
575         short8 sa = cast(short8)a;
576         short8 sb = cast(short8)b;
577         short8 r;
578         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
579         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
580         r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
581         r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
582         r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
583         r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
584         r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
585         r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
586         return cast(__m128i)r;
587     }
588 }
589 unittest
590 {
591     __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
592     short8 C = cast(short8) _mm_hsub_epi16(A, A);
593     short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
594     assert(C.array == correct);
595 }
596 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
598 __m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
599 { 
600     // PERF DMD
601     static if (GDC_with_SSSE3)
602     {
603         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
604     }
605     else static if (LDC_with_SSSE3)
606     {
607         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
608     }
609     else static if (LDC_with_ARM64)
610     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1.
612         int4 ia = cast(int4)a;
613         int4 ib = cast(int4)b;
614         int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
615         int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
616         return cast(__m128i)(c - d);
617     }
618     else
619     {
620         int4 ia = cast(int4)a;
621         int4 ib = cast(int4)b;
622         int4 r;
623         r.ptr[0] = ia.array[0] - ia.array[1];
624         r.ptr[1] = ia.array[2] - ia.array[3];
625         r.ptr[2] = ib.array[0] - ib.array[1];
626         r.ptr[3] = ib.array[2] - ib.array[3];
627         return cast(__m128i)r;
628     }
629 }
630 unittest
631 {
632     __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
633     __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
634     int4 C = cast(int4) _mm_hsub_epi32(A, B);
635     int[4] correct = [ -1, int.max, int.min, 0 ];
636     assert(C.array == correct);
637 }
638 
639 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, 
640 /// and pack the signed 16-bit results.
641 __m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
642 {
643     // PERF DMD
644     static if (GDC_with_SSSE3)
645     {
646         return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
647     }
648     else static if (LDC_with_ARM64)
649     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.3 -O1.
651         short4 sa = cast(short4)a;
652         short4 sb = cast(short4)b;
653         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
654         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
655         return cast(__m64)(c - d);
656     }
657     else
658     {
659         // LDC x86: generates phsubw since LDC 1.24 -O2
660         short4 sa = cast(short4)a;
661         short4 sb = cast(short4)b;
662         short4 r;
663         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
664         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
665         r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
666         r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
667         return cast(__m64)r;
668     }
669 }
670 unittest
671 {
672     __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
673     __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
674     short4 C = cast(short4) _mm_hsub_pi16(A, B);
675     short[4] correct = [ short.max, -4, -16, -32767];
676     assert(C.array == correct);
677 }
678 
679 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, 
680 /// and pack the signed 32-bit results.
681 __m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
682 {
683     // PERF DMD
684     static if (GDC_with_SSSE3)
685     {
686         return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
687     }
688     else static if (LDC_with_ARM64)
689     {
690         // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
691         int2 ia = cast(int2)a;
692         int2 ib = cast(int2)b;
693         int2 c = shufflevector!(int2, 0, 2)(ia, ib);
694         int2 d = shufflevector!(int2, 1, 3)(ia, ib);
695         return cast(__m64)(c - d);
696     }
697     else
698     {
699         // LDC x86: generates phsubd since LDC 1.24 -O2
700         int2 ia = cast(int2)a;
701         int2 ib = cast(int2)b;
702         int2 r;
703         r.ptr[0] = ia.array[0] - ia.array[1];
704         r.ptr[1] = ib.array[0] - ib.array[1];
705         return cast(__m64)r;
706     }
707 }
708 unittest
709 {
710     __m64 A = _mm_setr_pi32(int.min, 1);
711     __m64 B = _mm_setr_pi32(int.max, -1);
712     int2 C = cast(int2) _mm_hsub_pi32(A, B);
713     int[2] correct = [ int.max, int.min ];
714     assert(C.array == correct);
715 }
716 
717 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
718 /// and pack the signed 16-bit results.
719 __m128i _mm_hsubs_epi16 (__m128i a, __m128i b) @trusted
720 {
721      // PERF DMD
722     static if (GDC_with_SSSE3)
723     {
724         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
725     }
726     else static if (LDC_with_SSSE3)
727     {
728         return cast(__m128i)__builtin_ia32_phsubsw128(cast(short8)a, cast(short8)b);
729     }
730     else static if (LDC_with_ARM64)
731     {
732         // uzp1/uzp2/sqsub sequence
733         short8 sa = cast(short8)a;
734         short8 sb = cast(short8)b;
735         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
736         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
737         return cast(__m128i)vqsubq_s16(c, d);
738     }
739     else
740     {
741         short8 sa = cast(short8)a;
742         short8 sb = cast(short8)b;
743         short8 r;
744         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
745         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
746         r.ptr[2] = saturateSignedIntToSignedShort(sa.array[4] - sa.array[5]);
747         r.ptr[3] = saturateSignedIntToSignedShort(sa.array[6] - sa.array[7]);
748         r.ptr[4] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
749         r.ptr[5] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
750         r.ptr[6] = saturateSignedIntToSignedShort(sb.array[4] - sb.array[5]);
751         r.ptr[7] = saturateSignedIntToSignedShort(sb.array[6] - sb.array[7]);
752         return cast(__m128i)r;
753     }
754 }
755 unittest
756 {
757     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 32767, -1, -10, 32767);
758     short8 C = cast(short8) _mm_hsubs_epi16(A, A);
759     short[8] correct = [ 3, -4, 32767, -32768, 3, -4, 32767, -32768 ];
760     assert(C.array == correct);
761 }
762 
763 
764 /// Horizontally subtract adjacent pairs of signed 16-bit integers in `a` and `b` using saturation, 
765 /// and pack the signed 16-bit results.
766 __m64 _mm_hsubs_pi16 (__m64 a, __m64 b) @trusted
767 {
768     static if (GDC_with_SSSE3)
769     {
770         return cast(__m64)__builtin_ia32_phsubsw(cast(short4)a, cast(short4)b);
771     }
772     else static if (LDC_with_SSSE3)
773     {
774         // Note: LDC doesn't have __builtin_ia32_phsubsw
775         long2 la;
776         la.ptr[0] = a.array[0];
777         long2 lb;
778         lb.ptr[0] = b.array[0];
779         int4 sum = cast(int4)__builtin_ia32_phsubsw128(cast(short8)la, cast(short8)lb);
780         int2 r;
781         r.ptr[0] = sum.array[0];
782         r.ptr[1] = sum.array[2];
783         return cast(__m64)r;
784     }
785     else static if (LDC_with_ARM64)
786     {
787         // uzp1/uzp2/sqsub sequence in -O1
788         short4 sa = cast(short4)a;
789         short4 sb = cast(short4)b;
790         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
791         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
792         return cast(__m64)vqsub_s16(c, d);
793     }
794     else
795     {
796         short4 sa = cast(short4)a;
797         short4 sb = cast(short4)b;
798         short4 r;
799         r.ptr[0] = saturateSignedIntToSignedShort(sa.array[0] - sa.array[1]);
800         r.ptr[1] = saturateSignedIntToSignedShort(sa.array[2] - sa.array[3]);
801         r.ptr[2] = saturateSignedIntToSignedShort(sb.array[0] - sb.array[1]);
802         r.ptr[3] = saturateSignedIntToSignedShort(sb.array[2] - sb.array[3]);
803         return cast(__m64)r;
804     }
805 }
806 unittest
807 {
808     __m64 A = _mm_setr_pi16(-16, 32, 100, -32768);
809     __m64 B = _mm_setr_pi16( 64, 30,   -9,  32767);
810     short4 C = cast(short4) _mm_hsubs_pi16(A, B);
811     short[4] correct = [ -48, 32767,  34,  -32768];
812     assert(C.array == correct);
813 }
814 
815 
816 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
817 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
818 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
819 /// and pack the saturated results.
820 __m128i _mm_maddubs_epi16 (__m128i a, __m128i b) @trusted
821 {
822     static if (GDC_with_SSSE3)
823     {
824         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
825     }
826     else static if (LDC_with_SSSE3)
827     {
828         return cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)a, cast(byte16)b);
829     }
830     else
831     {
832         // zero-extend a to 16-bit
833         __m128i zero = _mm_setzero_si128();
834         __m128i a_lo = _mm_unpacklo_epi8(a, zero);
835         __m128i a_hi = _mm_unpackhi_epi8(a, zero);
836 
837         // sign-extend b to 16-bit
838         __m128i b_lo = _mm_unpacklo_epi8(b, zero);
839         __m128i b_hi = _mm_unpackhi_epi8(b, zero);    
840         b_lo = _mm_srai_epi16( _mm_slli_epi16(b_lo, 8), 8);
841         b_hi = _mm_srai_epi16( _mm_slli_epi16(b_hi, 8), 8); 
842 
843         // Multiply element-wise, no overflow can occur
844         __m128i c_lo = _mm_mullo_epi16(a_lo, b_lo);  
845         __m128i c_hi = _mm_mullo_epi16(a_hi, b_hi);
846 
847         // Add pairwise with saturating horizontal add
848         return _mm_hadds_epi16(c_lo, c_hi);
849     }
850 }
851 unittest
852 {
853     __m128i A = _mm_setr_epi8(  -1,  10, 100, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // u8
854     __m128i B = _mm_setr_epi8(-128, -30, 100,  127, -1, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0); // i8
855     short8 C = cast(short8) _mm_maddubs_epi16(A, B);
856     short[8] correct =       [   -32768,     26256, 0, 0, 0, 0, 0, 0];
857     assert(C.array == correct);
858 }
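
// A typical use of _mm_maddubs_epi16 is a widening pairwise sum (or dot product) of
// unsigned bytes against small signed weights; an illustrative sketch with weights of 1:
unittest
{
    __m128i bytes = _mm_setr_epi8(10, 20, 30, 40, cast(byte)200, cast(byte)250, 0, 0,
                                  0, 0, 0, 0, 0, 0, 0, 0);
    __m128i ones = _mm_set1_epi8(1);
    short8 sums = cast(short8) _mm_maddubs_epi16(bytes, ones);
    assert(sums.array[0] == 30);  // 10 + 20
    assert(sums.array[1] == 70);  // 30 + 40
    assert(sums.array[2] == 450); // 200 + 250, widened to 16-bit
}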
859 
860 /// Vertically multiply each unsigned 8-bit integer from `a` with the corresponding 
861 /// signed 8-bit integer from `b`, producing intermediate signed 16-bit integers. 
862 /// Horizontally add adjacent pairs of intermediate signed 16-bit integers, 
863 /// and pack the saturated results.
864 __m64 _mm_maddubs_pi16 (__m64 a, __m64 b) @trusted
865 {
866     static if (GDC_with_SSSE3)
867     {
868         return cast(__m64)__builtin_ia32_pmaddubsw(cast(byte8)a, cast(byte8)b);
869     }
870     else static if (LDC_with_SSSE3)
871     {
        __m128i A = to_m128i(a);
        __m128i B = to_m128i(b);
        return to_m64( cast(__m128i)__builtin_ia32_pmaddubsw128(cast(byte16)A, cast(byte16)B) );
875     }
876     else
877     {
878         // zero-extend a to 16-bit
879         __m128i zero = _mm_setzero_si128();
880         __m128i A = _mm_unpacklo_epi8(to_m128i(a), zero);
881 
882         // sign-extend b to 16-bit
883         __m128i B = _mm_unpacklo_epi8(to_m128i(b), zero);    
884         B = _mm_srai_epi16( _mm_slli_epi16(B, 8), 8);
885 
886         // Multiply element-wise, no overflow can occur
887         __m128i c = _mm_mullo_epi16(A, B);
888 
889         // Add pairwise with saturating horizontal add
890         return to_m64( _mm_hadds_epi16(c, zero));
891     }
892 }
893 unittest
894 {
895     __m64 A = _mm_setr_pi8(  -1,  10, 100, -128, 0, 0, 0, 0); // u8
896     __m64 B = _mm_setr_pi8(-128, -30, 100,  127, -1, 2, 4, 6); // i8
897     short4 C = cast(short4) _mm_maddubs_pi16(A, B);
898     short[4] correct =       [   -32768,   26256, 0, 0];
899     assert(C.array == correct);
900 }
901 
902 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
903 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
904 __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) @trusted
905 {
906     // PERF DMD
907     static if (GDC_with_SSSE3)
908     {
909         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
910     }
911     else static if (LDC_with_SSSE3)
912     {
913         return cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8)a, cast(short8)b);
914     }
915     else static if (LDC_with_ARM64)
916     {
917         int4 mul_lo = vmull_s16(vget_low_s16(cast(short8)a),
918                                 vget_low_s16(cast(short8)b));
919         int4 mul_hi = vmull_s16(vget_high_s16(cast(short8)a),
920                                 vget_high_s16(cast(short8)b));
921 
922         // Rounding narrowing shift right
923         // narrow = (int16_t)((mul + 16384) >> 15);
924         short4 narrow_lo = vrshrn_n_s32(mul_lo, 15);
925         short4 narrow_hi = vrshrn_n_s32(mul_hi, 15);
926 
927         // Join together.
928         return cast(__m128i) vcombine_s16(narrow_lo, narrow_hi);
929     }
930     else
931     {
932         short8 sa = cast(short8)a;
933         short8 sb = cast(short8)b;
934         short8 r;
935 
936         for (int i = 0; i < 8; ++i)
937         {
            // I doubted it at first, but an exhaustive search shows this to be equivalent to the Intel pseudocode.
939             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
940         }
941 
942         return cast(__m128i)r;
943     }
944 }
945 
946 unittest
947 {
948     __m128i A = _mm_setr_epi16(12345, -32768, 32767, 0, 1, 845, -6999, -1);
949     __m128i B = _mm_setr_epi16(8877, -24487, 15678, 32760, 1, 0, -149, -1);
950     short8 C = cast(short8) _mm_mulhrs_epi16(A, B);
951     short[8] correct = [3344, 24487, 15678, 0, 0, 0, 32, 0];
952     assert(C.array == correct);
953 }
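
// _mm_mulhrs_epi16 is a rounded Q15 fixed-point multiply; as an illustrative sketch,
// 0.5 * 0.5 in Q15 (16384 * 16384) yields 0.25 (8192):
unittest
{
    __m128i half = _mm_set1_epi16(16384);
    short8 r = cast(short8) _mm_mulhrs_epi16(half, half);
    assert(r.array[0] == 8192);
}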
954 
955 /// Multiply packed signed 16-bit integers in `a` and `b`, producing intermediate signed 32-bit integers.
956 /// Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and return bits `[16:1]`.
957 __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) @trusted
958 {
959     // PERF DMD
960     static if (GDC_with_SSSE3)
961     {
962         return cast(__m64) __builtin_ia32_pmulhrsw(cast(short4)a, cast(short4)b);
963     }
964     else static if (LDC_with_SSSE3)
965     {
966         return cast(__m64) to_m64( cast(__m128i) __builtin_ia32_pmulhrsw128(cast(short8) to_m128i(a), cast(short8) to_m128i(b)));
967     }
968     else static if (LDC_with_ARM64)
969     {
970         int4 mul = vmull_s16(cast(short4)a, cast(short4)b);
971 
972         // Rounding narrowing shift right
973         // (int16_t)((mul + 16384) >> 15);
974         return cast(__m64) vrshrn_n_s32(mul, 15);
975     }
976     else
977     {
978         short4 sa = cast(short4)a;
979         short4 sb = cast(short4)b;
980         short4 r;
981 
982         for (int i = 0; i < 4; ++i)
983         {
984             r.ptr[i] = cast(short) ( (sa.array[i] * sb.array[i] + 0x4000) >> 15);
985         }
986         return cast(__m64)r;
987     }
988 }
989 unittest
990 {
991     __m64 A = _mm_setr_pi16(12345, -32768, 32767, 0);
992     __m64 B = _mm_setr_pi16(8877, -24487, 15678, 32760);
993     short4 C = cast(short4) _mm_mulhrs_pi16(A, B);
994     short[4] correct = [3344, 24487, 15678, 0];
995     assert(C.array == correct);
996 }
997 
998 
999 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1000 __m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
1001 {
1002     // This is the lovely pshufb.
1003     // PERF DMD
1004     static if (GDC_with_SSSE3)
1005     {
1006         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1007     }
1008     else static if (LDC_with_SSSE3)
1009     {
1010         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
1011     }
1012     else static if (LDC_with_ARM64)
1013     {
1014         byte16 bb = cast(byte16)b;
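        // Keep bit 7 (pshufb's "zero this lane" flag) and the low 4 index bits; bits 4..6
        // are ignored by pshufb, but TBL would treat them as part of the index. Any masked
        // index >= 16 makes vqtbl1q_s8 return 0, matching pshufb's zeroing behaviour.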
1015         byte16 mask;
1016         mask = cast(byte)(0x8F);
1017         bb = bb & mask;
1018         byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
1019         return cast(__m128i)r;
1020     }
1021     else
1022     {
1023         byte16 r;
1024         byte16 ba = cast(byte16)a;
1025         byte16 bb = cast(byte16)b;
1026         for (int i = 0; i < 16; ++i)
1027         {
1028             byte s = bb.array[i];
1029             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
1030         }
1031         return cast(__m128i)r;
1032     }
1033 }
1034 unittest
1035 {
1036     __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
1037     __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
1038     byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
1039     byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
1040     assert(C.array == correct);
1041 }
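
// _mm_shuffle_epi8 is commonly used as an in-register table lookup or byte permute;
// an illustrative sketch reversing the 16 bytes of a vector:
unittest
{
    __m128i reversed = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 R = cast(byte16) _mm_shuffle_epi8(A, reversed);
    byte[16] correct = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0];
    assert(R.array == correct);
}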
1042 
1043 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
1044 __m64 _mm_shuffle_pi8 (__m64 a, __m64 b) @trusted
1045 {
1046     // PERF DMD
1047     static if (GDC_with_SSSE3)
1048     {
        alias ubyte8 = __vector(ubyte[8]);
1050         return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
1051     }
1052     else static if (LDC_with_SSSE3)
1053     {
        // GDC does the proper dance to avoid MMX registers; do it manually in LDC, since __builtin_ia32_pshufb doesn't exist there.
1055         __m128i A = to_m128i(a);
1056         __m128i index = to_m128i(b);
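        // Clear bit 3 of every index so lookups stay within the low 8 bytes of A;
        // bit 7 is kept so negative indices still produce zero, as pshufb requires.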
1057         index = index & _mm_set1_epi32(0xF7F7F7F7);
1058         return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
1059     }
1060     else static if (LDC_with_ARM64)
1061     {
1062         byte8 bb = cast(byte8)b;
1063         byte8 mask;
1064         mask = cast(byte)(0x87);
1065         bb = bb & mask;
1066         __m128i l = to_m128i(a);
1067         byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
1068         return cast(__m64)r;
1069     }
1070     else
1071     {
1072         byte8 r;
1073         byte8 ba = cast(byte8)a;
1074         byte8 bb = cast(byte8)b;
1075         for (int i = 0; i < 8; ++i)
1076         {
1077             byte s = bb.array[i];
1078             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
1079         }
1080         return cast(__m64)r;
1081     }
1082 }
1083 unittest
1084 {
1085     __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
1086     __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
1087     byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
1088     byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
1089     assert(C.array == correct);
1090 }
1091 
1092 /// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
1093 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1094 __m128i _mm_sign_epi16 (__m128i a, __m128i b) @trusted
1095 {
1096     // PERF DMD
1097     static if (GDC_with_SSSE3)
1098     {
1099         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
1100     }
1101     else static if (LDC_with_SSSE3)
1102     {
1103         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);       
1104     }
1105     else
1106     {
1107         // LDC arm64: 5 instructions
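        // mask is all-ones exactly where b < 0; (a + mask) ^ mask is then ~(a - 1) == -a,
        // and leaves a unchanged where mask is 0. The andnot zeroes lanes where b == 0.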
1108         __m128i mask = _mm_srai_epi16(b, 15);
1109         __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
1110         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
1111     }
1112 }
1113 unittest
1114 {
1115     __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
1116     __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
1117     short8 C = cast(short8) _mm_sign_epi16(A, B);
1118     short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
1119     assert(C.array == correct);
1120 }
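
// _mm_sign_epi16 acts as a per-lane conditional negate-or-zero; an illustrative sketch
// applying a precomputed sign pattern to magnitudes:
unittest
{
    __m128i mag   = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
    __m128i signs = _mm_setr_epi16( 1, -1,  0,  1, -1,  1,  0, -1);
    short8 r = cast(short8) _mm_sign_epi16(mag, signs);
    short[8] expected = [10, -20, 0, 40, -50, 60, 0, -80];
    assert(r.array == expected);
}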
1121 
1122 /// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative. 
1123 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1124 __m128i _mm_sign_epi32 (__m128i a, __m128i b) @trusted
1125 {
1126     // PERF DMD
1127     static if (GDC_with_SSSE3)
1128     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1130     }
1131     else static if (LDC_with_SSSE3)
1132     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
1134     }
1135     else
1136     {
1137         __m128i mask = _mm_srai_epi32(b, 31);
1138         __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
1139         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
1140     }
1141 }
1142 unittest
1143 {
1144     __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
1145     __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
1146     int4 C = cast(int4) _mm_sign_epi32(A, B);
1147     int[4] correct =          [ 2,  0, 0, int.max];
1148     assert(C.array == correct);
1149 }
1150 
1151 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1152 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1153 __m128i _mm_sign_epi8 (__m128i a, __m128i b) @trusted
1154 {
1155     // PERF DMD
1156     static if (GDC_with_SSSE3)
1157     {
1158         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1159     }
1160     else static if (LDC_with_SSSE3)
1161     {
1162         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
1163     }
1164     else
1165     {
1166         __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
1167         __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
1168         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
1169     }
1170 }
1171 unittest
1172 {
1173     __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
1174     __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1175     byte16  C = cast(byte16) _mm_sign_epi8(A, B);
1176     byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
1177     assert(C.array == correct);
1178 }
1179 
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1182 __m64 _mm_sign_pi16 (__m64 a, __m64 b) @trusted
1183 {
1184     return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
1185 }
1186 unittest
1187 {
1188     __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
1189     __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
1190     short4 C = cast(short4) _mm_sign_pi16(A, B);
1191     short[4] correct =     [-2, short.min,         0, short.min];
1192     assert(C.array == correct);
1193 }
1194 
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
1197 __m64 _mm_sign_pi32 (__m64 a, __m64 b) @trusted
1198 {
1199     return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
1200 }
1201 unittest
1202 {
1203     __m64 A = _mm_setr_pi32(-2, -100);
1204     __m64 B = _mm_setr_pi32(-1,  0);
1205     int2 C = cast(int2) _mm_sign_pi32(A, B);
1206     int[2] correct =          [ 2,  0];
1207     assert(C.array == correct);
1208 }
1209 
1210 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
1211 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
1212 __m64 _mm_sign_pi8 (__m64 a, __m64 b) @trusted
1213 {
1214     return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
1215 }
1216 unittest
1217 {
1218     __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
1219     __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
1220     byte8  C = cast(byte8) _mm_sign_pi8(A, B);
1221     byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
1222     assert(C.array == correct);
1223 }