1 /**
2 * SSSE3 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.tmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 
13 public import inteli.pmmintrin;
14 import inteli.mmx;
15 
16 nothrow @nogc:
17 
18 
19 // SSSE3 instructions
20 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSSE3
21 // Note: this header will work whether you have SSSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+ssse3"] or equivalent to actually
// generate SSSE3 instructions.
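// For example, an application's dub.json could contain the following
// (illustrative fragment; the application name and version constraint are
// placeholders):
//
//     {
//         "name": "myapp",
//         "dependencies": { "intel-intrinsics": "~>1.0" },
//         "dflags-ldc": ["-mattr=+ssse3"]
//     }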
24 
25 /// Compute the absolute value of packed signed 16-bit integers in `a`.
26 __m128i _mm_abs_epi16 (__m128i a) @trusted
27 {
28     static if (DMD_with_DSIMD)
29     {
30         return cast(__m128i)__simd(XMM.PABSW, a);
31     }
32     else static if (GDC_with_SSSE3)
33     {
34         return cast(__m128i) __builtin_ia32_pabsw128(cast(short8)a);
35     }
36     else static if (LDC_with_ARM64)
37     {
38         return cast(__m128i) vabsq_s16(cast(short8)a);
39     }
40     else
41     {
        // LDC x86: generates pabsw since LDC 1.1 -O2
43         short8 sa = cast(short8)a;
44         for (int i = 0; i < 8; ++i)
45         {
46             short s = sa.array[i];
47             sa.ptr[i] = s >= 0 ? s : cast(short)(-cast(int)(s));
48         }  
49         return cast(__m128i)sa;
50     }
51 }
52 unittest
53 {
54     __m128i A = _mm_setr_epi16(0, -1, -32768, 32767, 10, -10, 1000, -1000);
55     short8 B = cast(short8) _mm_abs_epi16(A);
56     short[8] correct = [0, 1, -32768, 32767, 10, 10, 1000, 1000];
57     assert(B.array == correct);
58 }
59 
60 /// Compute the absolute value of packed signed 32-bit integers in `a`.
61 __m128i _mm_abs_epi32 (__m128i a) @trusted
62 {
63     static if (DMD_with_DSIMD)
64     {
65         return cast(__m128i)__simd(XMM.PABSD, cast(int4)a);
66     }
67     else static if (GDC_with_SSSE3)
68     {
69         return cast(__m128i) __builtin_ia32_pabsd128(cast(int4)a);
70     }
71     else static if (LDC_with_ARM64)
72     {
73         return cast(__m128i) vabsq_s32(cast(int4)a);
74     }
75     else
76     {
77         // LDC x86: generates pabsd since LDC 1.1 -O2
78         int4 sa = cast(int4)a;
79         for (int i = 0; i < 4; ++i)
80         {
81             int s = sa.array[i];
82             sa.ptr[i] = s >= 0 ? s : -s;
83         }  
84         return cast(__m128i)sa;
85     } 
86 }
87 unittest
88 {
89     __m128i A = _mm_setr_epi32(0, -1, -2_147_483_648, -2_147_483_647);
90     int4 B = cast(int4) _mm_abs_epi32(A);
91     int[4] correct = [0, 1, -2_147_483_648, 2_147_483_647];
92     assert(B.array == correct);
93 }
94 
95 /// Compute the absolute value of packed signed 8-bit integers in `a`.
96 __m128i _mm_abs_epi8 (__m128i a) @trusted
97 {
98     static if (DMD_with_DSIMD)
99     {
100         return cast(__m128i)__simd(XMM.PABSB, cast(byte16)a);
101     }
102     else static if (GDC_with_SSSE3)
103     {
104         alias ubyte16 = __vector(ubyte[16]);
105         return cast(__m128i) __builtin_ia32_pabsb128(cast(ubyte16)a);
106     }
107     else static if (LDC_with_ARM64)
108     {
109         return cast(__m128i) vabsq_s8(cast(byte16)a);
110     }
111     else static if (LDC_with_SSSE3)
112     {
113         return __asm!__m128i("pabsb $1,$0","=x,x",a);
114     }
115     else
116     {
        // A loop version like in _mm_abs_epi16/_mm_abs_epi32 would be very slow
        // in LDC x86 and wouldn't vectorize. This min/sub trick doesn't generate
        // pabsb with LDC, though.
119         return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
120     }
121 }
122 unittest
123 {
124     __m128i A = _mm_setr_epi8(0, -1, -128, -127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
125     byte16 B = cast(byte16) _mm_abs_epi8(A);
126     byte[16] correct =       [0,  1, -128,  127, 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
127     assert(B.array == correct);
128 }
129 
130 /// Compute the absolute value of packed signed 16-bit integers in `a`.
131 __m64 _mm_abs_pi16 (__m64 a) @trusted
132 {
133     return to_m64(_mm_abs_epi16(to_m128i(a)));
134 }
135 unittest
136 {
137     __m64 A = _mm_setr_pi16(0, -1, -32768, 32767);
138     short4 B = cast(short4) _mm_abs_pi16(A);
139     short[4] correct = [0, 1, -32768, 32767];
140     assert(B.array == correct);
141 }
142 
143 /// Compute the absolute value of packed signed 32-bit integers in `a`.
144 __m64 _mm_abs_pi32 (__m64 a) @trusted
145 {
    return to_m64(_mm_abs_epi32(to_m128i(a)));
147 }
148 unittest
149 {
150     __m64 A = _mm_setr_pi32(-1, -2_147_483_648);
151     int2 B = cast(int2) _mm_abs_pi32(A);
152     int[2] correct = [1, -2_147_483_648];
153     assert(B.array == correct);
154 }
155 
156 /// Compute the absolute value of packed signed 8-bit integers in `a`.
157 __m64 _mm_abs_pi8 (__m64 a) @trusted
158 {
159     return to_m64(_mm_abs_epi8(to_m128i(a)));
160 }
161 unittest
162 {
163     __m64 A = _mm_setr_pi8(0, -1, -128, -127, 127, 0, 0, 0);
164     byte8 B = cast(byte8) _mm_abs_pi8(A);
165     byte[8] correct =       [0,  1, -128,  127, 127, 0, 0, 0];
166     assert(B.array == correct);
167 }
168 
169 /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, shift the result right by `count` bytes, and return the low 16 bytes.
170 __m128i _mm_alignr_epi8(ubyte count)(__m128i a, __m128i b) @trusted
171 {
172     // PERF DMD
173     static if (GDC_with_SSSE3)
174     {
175         return cast(__m128i)__builtin_ia32_palignr128(cast(long2)a, cast(long2)b, count * 8);
176     }
177     else
178     {
179         // Generates palignr since LDC 1.1 -O1
180         // Also generates a single ext instruction on arm64.
181         return cast(__m128i) shufflevector!(byte16, ( 0 + count) % 32,
182                                                     ( 1 + count) % 32,
183                                                     ( 2 + count) % 32,
184                                                     ( 3 + count) % 32,
185                                                     ( 4 + count) % 32,
186                                                     ( 5 + count) % 32,
187                                                     ( 6 + count) % 32,
188                                                     ( 7 + count) % 32,
189                                                     ( 8 + count) % 32,
190                                                     ( 9 + count) % 32,
191                                                     (10 + count) % 32,
192                                                     (11 + count) % 32,
193                                                     (12 + count) % 32,
194                                                     (13 + count) % 32,
195                                                     (14 + count) % 32,
196                                                     (15 + count) % 32)(cast(byte16)a, cast(byte16)b);
197     }
198 }
199 unittest
200 {
201     __m128i A = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
202     __m128i B = _mm_setr_epi8(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
203 
204     {
205         byte16 C = cast(byte16)_mm_alignr_epi8!7(A ,B);
206         byte[16] correct = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
207         assert(C.array == correct);
208     }
209     {
210         byte16 C = cast(byte16)_mm_alignr_epi8!20(A ,B);
211         byte[16] correct = [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4];
212         assert(C.array == correct);
213     }
214 }
215 
216 /// Concatenate 8-byte blocks in `a` and `b` into a 16-byte temporary result, shift the result right by `count` bytes, and return the low 8 bytes.
217 __m64 _mm_alignr_pi8(ubyte count)(__m64 a, __m64 b) @trusted
218 {
219     // PERF DMD
220     static if (GDC_with_SSSE3)
221     {
222         return cast(__m64)__builtin_ia32_palignr(cast(long)a, cast(long)b, count * 8);
223     }
224     else
225     {
226         // Note: in LDC x86 this uses a pshufb.
227         // Generates ext in arm64.
228         return cast(__m64) shufflevector!(byte8, (0 + count) % 16,
229                                                  (1 + count) % 16,
230                                                  (2 + count) % 16,
231                                                  (3 + count) % 16,
232                                                  (4 + count) % 16,
233                                                  (5 + count) % 16,
234                                                  (6 + count) % 16,
235                                                  (7 + count) % 16)(cast(byte8)a, cast(byte8)b);
236     }
237 }
238 unittest
239 {
240     __m64 A = _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8);
241     __m64 B = _mm_setr_pi8(17, 18, 19, 20, 21, 22, 23, 24);
242 
243     {
244         byte8 C = cast(byte8)_mm_alignr_pi8!3(A ,B);
245         byte[8] correct = [4, 5, 6, 7, 8, 17, 18, 19];
246         assert(C.array == correct);
247     }
248     {
249         byte8 C = cast(byte8)_mm_alignr_pi8!10(A ,B);
250         byte[8] correct = [19, 20, 21, 22, 23, 24, 1, 2];
251         assert(C.array == correct);
252     }
253 }
254 
255 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
256 __m128i _mm_hadd_epi16 (__m128i a, __m128i b) @trusted
257 {
258     // PERF DMD
259     static if (GDC_with_SSSE3)
260     {
261         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
262     }
263     else static if (LDC_with_SSSE3)
264     {
265         return cast(__m128i)__builtin_ia32_phaddw128(cast(short8)a, cast(short8)b);
266     }
267     else static if (LDC_with_ARM64)
268     {
269         return cast(__m128i)vpaddq_s16(cast(short8)a, cast(short8)b);
270     }
271     else
272     {
273         short8 sa = cast(short8)a;
274         short8 sb = cast(short8)b;
275         short8 r;
276         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]);
277         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
278         r.ptr[2] = cast(short)(sa.array[4] + sa.array[5]);
279         r.ptr[3] = cast(short)(sa.array[6] + sa.array[7]);
280         r.ptr[4] = cast(short)(sb.array[0] + sb.array[1]);
281         r.ptr[5] = cast(short)(sb.array[2] + sb.array[3]);
282         r.ptr[6] = cast(short)(sb.array[4] + sb.array[5]);
283         r.ptr[7] = cast(short)(sb.array[6] + sb.array[7]);
284         return cast(__m128i)r;
285     }
286 }
287 unittest
288 {
289     __m128i A = _mm_setr_epi16(1, -2, 4, 8, 16, 32, -1, -32768);
290     short8 C = cast(short8) _mm_hadd_epi16(A, A);
291     short[8] correct = [ -1, 12, 48, 32767, -1, 12, 48, 32767];
292     assert(C.array == correct);
293 }
294 
295 /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
296 __m128i _mm_hadd_epi32 (__m128i a, __m128i b) @trusted
297 { 
298     // PERF DMD
299     static if (GDC_with_SSSE3)
300     {
301         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
302     }
303     else static if (LDC_with_SSSE3)
304     {
305         return cast(__m128i)__builtin_ia32_phaddd128(cast(int4)a, cast(int4)b);
306     }
307     else static if (LDC_with_ARM64)
308     {
309         return cast(__m128i)vpaddq_s32(cast(int4)a, cast(int4)b);
310     }
311     else
312     {
313         int4 ia = cast(int4)a;
314         int4 ib = cast(int4)b;
315         int4 r;
316         r.ptr[0] = ia.array[0] + ia.array[1];
317         r.ptr[1] = ia.array[2] + ia.array[3];
318         r.ptr[2] = ib.array[0] + ib.array[1];
319         r.ptr[3] = ib.array[2] + ib.array[3];
320         return cast(__m128i)r;
321     }
322 }
323 unittest
324 {
325     __m128i A = _mm_setr_epi32(1, -2, int.min, -1);
326     __m128i B = _mm_setr_epi32(1, int.max, 4, -4);
327     int4 C = cast(int4) _mm_hadd_epi32(A, B);
328     int[4] correct = [ -1, int.max, int.min, 0 ];
329     assert(C.array == correct);
330 }
331 
332 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
333 __m64 _mm_hadd_pi16 (__m64 a, __m64 b) @trusted
334 {
335     // PERF DMD
336     static if (GDC_with_SSSE3)
337     {
338         return cast(__m64) __builtin_ia32_phaddw(cast(short4)a, cast(short4)b);
339     }
340     else static if (LDC_with_ARM64)
341     {
342         return cast(__m64) vpadd_s16(cast(short4)a, cast(short4)b);
343     }
344     else
345     {
346         // LDC x86: generates phaddw since LDC 1.24 -O2.
347         short4 r;
348         short4 sa = cast(short4)a;
349         short4 sb = cast(short4)b;
350         r.ptr[0] = cast(short)(sa.array[0] + sa.array[1]); 
351         r.ptr[1] = cast(short)(sa.array[2] + sa.array[3]);
352         r.ptr[2] = cast(short)(sb.array[0] + sb.array[1]);
353         r.ptr[3] = cast(short)(sb.array[2] + sb.array[3]);
354         return cast(__m64)r;
355     }
356 }
357 unittest
358 {
359     __m64 A = _mm_setr_pi16(1, -2, 4, 8);
360     __m64 B = _mm_setr_pi16(16, 32, -1, -32768);
361     short4 C = cast(short4) _mm_hadd_pi16(A, B);
362     short[4] correct = [ -1, 12, 48, 32767 ];
363     assert(C.array == correct);
364 }
365 
366 
/// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m64 _mm_hadd_pi32 (__m64 a, __m64 b) @trusted
368 {
369     // PERF DMD
370     static if (GDC_with_SSSE3)
371     {
372         return cast(__m64) __builtin_ia32_phaddd(cast(int2)a, cast(int2)b);
373     }
374     else static if (LDC_with_ARM64)
375     {
376         return cast(__m64)vpadd_s32(cast(int2)a, cast(int2)b);
377     }
378     else
379     {
380         // LDC x86: generates phaddd since LDC 1.24 -O2
381         int2 ia = cast(int2)a;
382         int2 ib = cast(int2)b;
383         int2 r;
384         r.ptr[0] = ia.array[0] + ia.array[1];
385         r.ptr[1] = ib.array[0] + ib.array[1];
386         return cast(__m64)r;
387     }
388 }
389 unittest
390 {
391     __m64 A = _mm_setr_pi32(int.min, -1);
392     __m64 B = _mm_setr_pi32(1, int.max);
393     int2 C = cast(int2) _mm_hadd_pi32(A, B);
394     int[2] correct = [ int.max, int.min ];
395     assert(C.array == correct);
396 }
397 
398 
399 /*
400 __m128i _mm_hadds_epi16 (__m128i a, __m128i b)
401 {
402 }
403 unittest
404 {
405 }
406 */
407 /*
408 __m64 _mm_hadds_pi16 (__m64 a, __m64 b)
409 {
410 }
411 unittest
412 {
413 }
414 */
415 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
417 __m128i _mm_hsub_epi16 (__m128i a, __m128i b) @trusted
418 {
419     // PERF DMD
420     static if (GDC_with_SSSE3)
421     {
422         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
423     }
424     else static if (LDC_with_SSSE3)
425     {
426         return cast(__m128i)__builtin_ia32_phsubw128(cast(short8)a, cast(short8)b);
427     }
428     else static if (LDC_with_ARM64)
429     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1
431         short8 sa = cast(short8)a;
432         short8 sb = cast(short8)b;
433         short8 c = shufflevector!(short8, 0, 2, 4, 6, 8, 10, 12, 14)(sa, sb);
434         short8 d = shufflevector!(short8, 1, 3, 5, 7, 9, 11, 13, 15)(sa, sb);
435         return cast(__m128i)(c - d);
436     }
437     else 
438     {
439         short8 sa = cast(short8)a;
440         short8 sb = cast(short8)b;
441         short8 r;
442         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
443         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
444         r.ptr[2] = cast(short)(sa.array[4] - sa.array[5]);
445         r.ptr[3] = cast(short)(sa.array[6] - sa.array[7]);
446         r.ptr[4] = cast(short)(sb.array[0] - sb.array[1]);
447         r.ptr[5] = cast(short)(sb.array[2] - sb.array[3]);
448         r.ptr[6] = cast(short)(sb.array[4] - sb.array[5]);
449         r.ptr[7] = cast(short)(sb.array[6] - sb.array[7]);
450         return cast(__m128i)r;
451     }
452 }
453 unittest
454 {
455     __m128i A = _mm_setr_epi16(short.min, 1, 4, 8, 16, 32, 1, -32768);
456     short8 C = cast(short8) _mm_hsub_epi16(A, A);
457     short[8] correct = [ short.max, -4, -16, -32767, short.max, -4, -16, -32767];
458     assert(C.array == correct);
459 }
460 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
462 __m128i _mm_hsub_epi32 (__m128i a, __m128i b) @trusted
463 { 
464     // PERF DMD
465     static if (GDC_with_SSSE3)
466     {
467         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
468     }
469     else static if (LDC_with_SSSE3)
470     {
471         return cast(__m128i)__builtin_ia32_phsubd128(cast(int4)a, cast(int4)b);
472     }
473     else static if (LDC_with_ARM64)
474     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.8 -O1
476         int4 ia = cast(int4)a;
477         int4 ib = cast(int4)b;
478         int4 c = shufflevector!(int4, 0, 2, 4, 6)(ia, ib);
479         int4 d = shufflevector!(int4, 1, 3, 5, 7)(ia, ib);
480         return cast(__m128i)(c - d);
481     }
482     else
483     {
484         int4 ia = cast(int4)a;
485         int4 ib = cast(int4)b;
486         int4 r;
487         r.ptr[0] = ia.array[0] - ia.array[1];
488         r.ptr[1] = ia.array[2] - ia.array[3];
489         r.ptr[2] = ib.array[0] - ib.array[1];
490         r.ptr[3] = ib.array[2] - ib.array[3];
491         return cast(__m128i)r;
492     }
493 }
494 unittest
495 {
496     __m128i A = _mm_setr_epi32(1, 2, int.min, 1);
497     __m128i B = _mm_setr_epi32(int.max, -1, 4, 4);
498     int4 C = cast(int4) _mm_hsub_epi32(A, B);
499     int[4] correct = [ -1, int.max, int.min, 0 ];
500     assert(C.array == correct);
501 }
502 
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`, and pack the signed 16-bit results.
__m64 _mm_hsub_pi16 (__m64 a, __m64 b) @trusted
504 {
505     // PERF DMD
506     static if (GDC_with_SSSE3)
507     {
508         return cast(__m64)__builtin_ia32_phsubw(cast(short4)a, cast(short4)b);
509     }
510     else static if (LDC_with_ARM64)
511     {
        // Produces a uzp1/uzp2/sub sequence since LDC 1.3 -O1
513         short4 sa = cast(short4)a;
514         short4 sb = cast(short4)b;
515         short4 c = shufflevector!(short4, 0, 2, 4, 6)(sa, sb);
516         short4 d = shufflevector!(short4, 1, 3, 5, 7)(sa, sb);
517         return cast(__m64)(c - d);
518     }
519     else
520     {
521         // LDC x86: generates phsubw since LDC 1.24 -O2
522         short4 sa = cast(short4)a;
523         short4 sb = cast(short4)b;
524         short4 r;
525         r.ptr[0] = cast(short)(sa.array[0] - sa.array[1]);
526         r.ptr[1] = cast(short)(sa.array[2] - sa.array[3]);
527         r.ptr[2] = cast(short)(sb.array[0] - sb.array[1]);
528         r.ptr[3] = cast(short)(sb.array[2] - sb.array[3]);
529         return cast(__m64)r;
530     }
531 }
532 unittest
533 {
534     __m64 A = _mm_setr_pi16(short.min, 1, 4, 8);
535     __m64 B = _mm_setr_pi16(16, 32, 1, -32768);
536     short4 C = cast(short4) _mm_hsub_pi16(A, B);
537     short[4] correct = [ short.max, -4, -16, -32767];
538     assert(C.array == correct);
539 }
540 
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`, and pack the signed 32-bit results.
__m64 _mm_hsub_pi32 (__m64 a, __m64 b) @trusted
542 {
543     // PERF DMD
544     static if (GDC_with_SSSE3)
545     {
546         return cast(__m64)__builtin_ia32_phsubd(cast(int2)a, cast(int2)b);
547     }
548     else static if (LDC_with_ARM64)
549     {
550         // LDC arm64: generates zip1+zip2+sub sequence since LDC 1.8 -O1
551         int2 ia = cast(int2)a;
552         int2 ib = cast(int2)b;
553         int2 c = shufflevector!(int2, 0, 2)(ia, ib);
554         int2 d = shufflevector!(int2, 1, 3)(ia, ib);
555         return cast(__m64)(c - d);
556     }
557     else
558     {
559         // LDC x86: generates phsubd since LDC 1.24 -O2
560         int2 ia = cast(int2)a;
561         int2 ib = cast(int2)b;
562         int2 r;
563         r.ptr[0] = ia.array[0] - ia.array[1];
564         r.ptr[1] = ib.array[0] - ib.array[1];
565         return cast(__m64)r;
566     }
567 }
568 unittest
569 {
570     __m64 A = _mm_setr_pi32(int.min, 1);
571     __m64 B = _mm_setr_pi32(int.max, -1);
572     int2 C = cast(int2) _mm_hsub_pi32(A, B);
573     int[2] correct = [ int.max, int.min ];
574     assert(C.array == correct);
575 }
576 
577 /*
578 __m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
579 {
580 }
581 unittest
582 {
583 }
584 */
585 /*
586 __m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
587 {
588 }
589 unittest
590 {
591 }
592 */
593 
594 
595 /*
596 __m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
597 {
598 }
599 unittest
600 {
601 }
602 */
603 /*
604 __m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
605 {
606 }
607 unittest
608 {
609 }
610 */
611 /*
612 __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
613 {
614 }
615 unittest
616 {
617 }
618 */
619 /*
620 __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
621 {
622 }
623 unittest
624 {
625 }
626 */
627 
628 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
629 __m128i _mm_shuffle_epi8 (__m128i a, __m128i b) @trusted
630 {
631     // This is the lovely pshufb.
632     // PERF DMD
633     static if (GDC_with_SSSE3)
634     {
635         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
636     }
637     else static if (LDC_with_SSSE3)
638     {
639         return cast(__m128i) __builtin_ia32_pshufb128(cast(byte16) a, cast(byte16) b);
640     }
641     else static if (LDC_with_ARM64)
642     {
643         byte16 bb = cast(byte16)b;
644         byte16 mask;
645         mask = cast(byte)(0x8F);
646         bb = bb & mask;
647         byte16 r = vqtbl1q_s8(cast(byte16)a, bb);
648         return cast(__m128i)r;
649     }
650     else
651     {
652         byte16 r;
653         byte16 ba = cast(byte16)a;
654         byte16 bb = cast(byte16)b;
655         for (int i = 0; i < 16; ++i)
656         {
657             byte s = bb.array[i];
658             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 15 ];
659         }
660         return cast(__m128i)r;
661     }
662 }
663 unittest
664 {
665     __m128i A = _mm_setr_epi8(15,   14,      13,  12, 11,  10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0);
666     __m128i B = _mm_setr_epi8(15, -128, 13 + 16, -12, 11, -10, 9, 8, 7, 6, -5,  4,  3, -2,  1,  0);
667     byte16 C = cast(byte16) _mm_shuffle_epi8(A, B);
668     byte[16] correct =         [0,   0,       2,  0,  4,   0, 6, 7, 8, 9,  0, 11, 12,  0, 14, 15];
669     assert(C.array == correct);
670 }
671 
672 /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in the corresponding 8-bit element of `b`.
673 __m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
674 {
675     // PERF DMD
676     static if (GDC_with_SSSE3)
677     {
678         alias ubyte8  =__vector(ubyte[8]);
679         return cast(__m64) __builtin_ia32_pshufb(cast(ubyte8) a, cast(ubyte8) b);
680     }
681     else static if (LDC_with_SSSE3)
682     {
        // GDC does the proper dance to avoid MMX registers; do it manually here
        // since __builtin_ia32_pshufb doesn't exist in LDC.
684         __m128i A = to_m128i(a);
685         __m128i index = to_m128i(b);
686         index = index & _mm_set1_epi32(0xF7F7F7F7);
687         return to_m64( cast(__m128i) __builtin_ia32_pshufb128(cast(byte16)A, cast(byte16) index) );
688     }
689     else static if (LDC_with_ARM64)
690     {
691         byte8 bb = cast(byte8)b;
692         byte8 mask;
693         mask = cast(byte)(0x87);
694         bb = bb & mask;
695         __m128i l = to_m128i(a);
696         byte8 r = vtbl1_s8(cast(byte16)l, cast(byte8)bb);
697         return cast(__m64)r;
698     }
699     else
700     {
701         byte8 r;
702         byte8 ba = cast(byte8)a;
703         byte8 bb = cast(byte8)b;
704         for (int i = 0; i < 8; ++i)
705         {
706             byte s = bb.array[i];
707             r.ptr[i] = (s < 0) ? 0 : ba.array[ s & 7 ];
708         }
709         return cast(__m64)r;
710     }
711 }
712 unittest
713 {
714     __m64 A = _mm_setr_pi8(7,  6,  5,  4,      3,  2,  1,  0);
715     __m64 B = _mm_setr_pi8(7,  6, -5,  4,  3 + 8, -2,  1,  0);
716     byte8 C = cast(byte8) _mm_shuffle_pi8(A, B);
717     byte[8] correct =    [0,  1,  0,  3,      4,  0,  6,  7];
718     assert(C.array == correct);
719 }
720 
721 /// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
722 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
723 __m128i _mm_sign_epi16 (__m128i a, __m128i b)
724 {
725     // PERF DMD
726     static if (GDC_with_SSSE3)
727     {
728         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);
729     }
730     else static if (LDC_with_SSSE3)
731     {
732         return cast(__m128i) __builtin_ia32_psignw128(cast(short8)a, cast(short8)b);       
733     }
734     else
735     {
736         // LDC arm64: 5 instructions
737         __m128i mask = _mm_srai_epi16(b, 15);
738         __m128i zeromask = _mm_cmpeq_epi16(b, _mm_setzero_si128());
739         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi16(a, mask), mask));
740     }
741 }
742 unittest
743 {
744     __m128i A = _mm_setr_epi16(-2, -1, 0, 1,  2, short.min, short.min, short.min);
745     __m128i B = _mm_setr_epi16(-1,  0,-1, 1, -2,       -50,         0,        50);
746     short8 C = cast(short8) _mm_sign_epi16(A, B);
747     short[8] correct =        [ 2,  0, 0, 1, -2, short.min,         0, short.min];
748     assert(C.array == correct);
749 }
750 
751 /// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative. 
752 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
753 __m128i _mm_sign_epi32 (__m128i a, __m128i b)
754 {
755     // PERF DMD
756     static if (GDC_with_SSSE3)
757     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
759     }
760     else static if (LDC_with_SSSE3)
761     {
        return cast(__m128i) __builtin_ia32_psignd128(cast(int4)a, cast(int4)b);
763     }
764     else
765     {
766         __m128i mask = _mm_srai_epi32(b, 31);
767         __m128i zeromask = _mm_cmpeq_epi32(b, _mm_setzero_si128());
768         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi32(a, mask), mask));
769     }
770 }
771 unittest
772 {
773     __m128i A = _mm_setr_epi32(-2, -1,  0, int.max);
774     __m128i B = _mm_setr_epi32(-1,  0, -1, 1);
775     int4 C = cast(int4) _mm_sign_epi32(A, B);
776     int[4] correct =          [ 2,  0, 0, int.max];
777     assert(C.array == correct);
778 }
779 
780 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
781 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
782 __m128i _mm_sign_epi8 (__m128i a, __m128i b)
783 {
784     // PERF DMD
785     static if (GDC_with_SSSE3)
786     {
787         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
788     }
789     else static if (LDC_with_SSSE3)
790     {
791         return cast(__m128i) __builtin_ia32_psignb128(cast(byte16)a, cast(byte16)b);
792     }
793     else
794     {
795         __m128i mask = _mm_cmplt_epi8(b, _mm_setzero_si128()); // extend sign bit
796         __m128i zeromask = _mm_cmpeq_epi8(b, _mm_setzero_si128());
797         return _mm_andnot_si128(zeromask, _mm_xor_si128(_mm_add_epi8(a, mask), mask));
798     }
799 }
800 unittest
801 {
802     __m128i A = _mm_setr_epi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min, -1,  0,-1, 1, -2,      -50,        0,       50);
803     __m128i B = _mm_setr_epi8(-1,  0,-1, 1, -2,      -50,        0,       50, -2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
804     byte16  C = cast(byte16) _mm_sign_epi8(A, B);
805     byte[16] correct =       [ 2,  0, 0, 1, -2, byte.min,        0, byte.min,  1,  0, 0, 1, -2,       50,        0,      -50];
806     assert(C.array == correct);
807 }
808 
/// Negate packed 16-bit integers in `a` when the corresponding signed 16-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
811 __m64 _mm_sign_pi16 (__m64 a, __m64 b)
812 {
813     return to_m64( _mm_sign_epi16( to_m128i(a), to_m128i(b)) );
814 }
815 unittest
816 {
817     __m64 A = _mm_setr_pi16( 2, short.min, short.min, short.min);
818     __m64 B = _mm_setr_pi16(-2,       -50,         0,        50);
819     short4 C = cast(short4) _mm_sign_pi16(A, B);
820     short[4] correct =     [-2, short.min,         0, short.min];
821     assert(C.array == correct);
822 }
823 
/// Negate packed 32-bit integers in `a` when the corresponding signed 32-bit integer in `b` is negative.
/// Elements in result are zeroed out when the corresponding element in `b` is zero.
826 __m64 _mm_sign_pi32 (__m64 a, __m64 b)
827 {
828     return to_m64( _mm_sign_epi32( to_m128i(a), to_m128i(b)) );
829 }
830 unittest
831 {
832     __m64 A = _mm_setr_pi32(-2, -100);
833     __m64 B = _mm_setr_pi32(-1,  0);
834     int2 C = cast(int2) _mm_sign_pi32(A, B);
835     int[2] correct =          [ 2,  0];
836     assert(C.array == correct);
837 }
838 
839 /// Negate packed 8-bit integers in `a` when the corresponding signed 8-bit integer in `b` is negative. 
840 /// Elements in result are zeroed out when the corresponding element in `b` is zero.
841 __m64 _mm_sign_pi8 (__m64 a, __m64 b)
842 {
843     return to_m64( _mm_sign_epi8( to_m128i(a), to_m128i(b)) );
844 }
845 unittest
846 {
847     __m64 A = _mm_setr_pi8(-2, -1, 0, 1,  2, byte.min, byte.min, byte.min);
848     __m64 B = _mm_setr_pi8(-1,  0,-1, 1, -2,      -50,        0,       50);
849     byte8  C = cast(byte8) _mm_sign_pi8(A, B);
850     byte[8] correct =     [ 2,  0, 0, 1, -2, byte.min,        0, byte.min];
851     assert(C.array == correct);
852 }
853 
854 
855 
856 /*
857 
858 
859 Note: LDC 1.0 to 1.27 have the following builtins:
860 
861 pragma(LDC_intrinsic, "llvm.x86.ssse3.phadd.sw.128")
862     short8 __builtin_ia32_phaddsw128(short8, short8) pure @safe;
863 
864 pragma(LDC_intrinsic, "llvm.x86.ssse3.phsub.sw.128")
865     short8 __builtin_ia32_phsubsw128(short8, short8) pure @safe;
866 
867 pragma(LDC_intrinsic, "llvm.x86.ssse3.pmadd.ub.sw.128")
868     short8 __builtin_ia32_pmaddubsw128(byte16, byte16) pure @safe;
869 
870 pragma(LDC_intrinsic, "llvm.x86.ssse3.pmul.hr.sw.128")
871     short8 __builtin_ia32_pmulhrsw128(short8, short8) pure @safe;
872 
873 */