1 /**
2 * Copyright: Copyright Auburn Sounds 2016-2018.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module inteli.emmintrin;
7 
8 public import inteli.types;
9 public import inteli.xmmintrin; // SSE2 includes SSE1
10 
11 import inteli.internals;
12 
13 nothrow @nogc:
14 
15 // SSE2 instructions
16 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE2
17 
/// Element-wise sum of `a` and `b`, both viewed as `short8`.
__m128i _mm_add_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs + rhs);
}
22 
/// Element-wise sum of `a` and `b`, both viewed as `int4`.
__m128i _mm_add_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs + rhs);
}
27 
/// Element-wise sum of `a` and `b`, both viewed as `long2`.
__m128i _mm_add_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs + rhs);
}
32 
/// Element-wise sum of `a` and `b`, both viewed as `byte16`.
__m128i _mm_add_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs + rhs);
}
37 
/// Adds element 0 of `b` to element 0 of `a`; element 1 of `a` is returned unchanged.
__m128d _mm_add_sd(__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    r[0] = a[0] + b[0];
    return r;
}
unittest
{
    __m128d v = [1.5, -2.0];
    __m128d sum = _mm_add_sd(v, v);
    assert(sum.array == [3.0, -2.0]);
}
49 
50 
/// Element-wise sum of the two double vectors.
__m128d _mm_add_pd (__m128d a, __m128d b) pure @safe
{
    __m128d sum = a + b;
    return sum;
}
unittest
{
    __m128d v = [1.5, -2.0];
    __m128d doubled = _mm_add_pd(v, v);
    assert(doubled.array == [3.0, -4.0]);
}
61 
62 // MMXREG: _mm_add_si64
63 
version(LDC)
{
    // With LDC the intrinsic is simply another name for the compiler builtin.
    alias _mm_adds_epi16 = __builtin_ia32_paddsw128;
}
else
{
    /// Fallback: sums the `short8` lanes of `a` and `b` one by one, passing
    /// each sum through `saturateSignedIntToSignedShort`.
    __m128i _mm_adds_epi16(__m128i a, __m128i b) pure @trusted
    {
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        short[8] lanes;
        foreach (idx, ref lane; lanes)
            lane = saturateSignedIntToSignedShort(lhs.array[idx] + rhs.array[idx]);
        return _mm_loadu_si128(cast(int4*) lanes.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 sum = cast(short8) _mm_adds_epi16(ramp, ramp);
    static immutable short[8] expected = [0, 2, 4, 6, 8, 10, 12, 14];
    assert(sum.array == expected);
}
87 
version(LDC)
{
    // With LDC the intrinsic is simply another name for the compiler builtin.
    alias _mm_adds_epi8 = __builtin_ia32_paddsb128;
}
else
{
    /// Fallback: sums the `byte16` lanes of `a` and `b` one by one, passing
    /// each sum through `saturateSignedWordToSignedByte`.
    __m128i _mm_adds_epi8(__m128i a, __m128i b) pure @trusted
    {
        byte16 lhs = cast(byte16) a;
        byte16 rhs = cast(byte16) b;
        byte[16] lanes;
        foreach (idx, ref lane; lanes)
            lane = saturateSignedWordToSignedByte(lhs.array[idx] + rhs.array[idx]);
        return _mm_loadu_si128(cast(int4*) lanes.ptr);
    }
}
unittest
{
    __m128i ramp = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    byte16 sum = cast(byte16) _mm_adds_epi8(ramp, ramp);
    static immutable byte[16] expected = [ 0,  2,  4,  6,  8, 10, 12, 14,
                                          16, 18, 20, 22, 24, 26, 28, 30];
    assert(sum.array == expected);
}
112 
version(LDC)
{
    // With LDC the intrinsic is simply another name for the compiler builtin.
    alias _mm_adds_epu8 = __builtin_ia32_paddusb128;
}
else
{
    /// Fallback: reinterprets each byte lane as `ubyte`, sums the pairs and
    /// passes each sum through `saturateSignedWordToUnsignedByte`.
    __m128i _mm_adds_epu8(__m128i a, __m128i b) pure @trusted
    {
        byte16 lhs = cast(byte16) a;
        byte16 rhs = cast(byte16) b;
        ubyte[16] lanes;
        foreach (idx, ref lane; lanes)
            lane = saturateSignedWordToUnsignedByte(cast(ubyte)(lhs.array[idx]) + cast(ubyte)(rhs.array[idx]));
        return _mm_loadu_si128(cast(int4*) lanes.ptr);
    }
}
129 
version(LDC)
{
    // With LDC the intrinsic is simply another name for the compiler builtin.
    alias _mm_adds_epu16 = __builtin_ia32_paddusw128;
}
else
{
    /// Fallback: reinterprets each 16-bit lane as `ushort`, sums the pairs and
    /// passes each sum through `saturateSignedIntToUnsignedShort`.
    __m128i _mm_adds_epu16(__m128i a, __m128i b) pure @trusted
    {
        short8 lhs = cast(short8) a;
        short8 rhs = cast(short8) b;
        ushort[8] lanes;
        foreach (idx, ref lane; lanes)
            lane = saturateSignedIntToUnsignedShort(cast(ushort)(lhs.array[idx]) + cast(ushort)(rhs.array[idx]));
        return _mm_loadu_si128(cast(int4*) lanes.ptr);
    }
}
146 
/// Bitwise AND of `a` and `b`, performed on their `__m128i` reinterpretation.
__m128d _mm_and_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(bitsA & bitsB);
}
151 
/// Bitwise AND of `a` and `b`.
__m128i _mm_and_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i masked = a & b;
    return masked;
}
156 
/// Computes `(~a) & b` on the `__m128i` reinterpretation of both operands.
__m128d _mm_andnot_pd (__m128d a, __m128d b) pure @safe
{
    __m128i invertedA = ~cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(invertedA & bitsB);
}
161 
/// Computes `(~a) & b`.
__m128i _mm_andnot_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i invertedA = ~a;
    return invertedA & b;
}
166 
version(LDC)
{
    // Both averaging intrinsics are bound directly to LLVM intrinsics.
    // Note that they are declared on short8/byte16 rather than __m128i.
    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.b")
        byte16 _mm_avg_epu8(byte16, byte16) pure @safe;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pavg.w")
        short8 _mm_avg_epu16(short8, short8) pure @safe;
}
175 // TODO
176 
177 
178 // TODO: __m128i _mm_bslli_si128 (__m128i a, int imm8)
179 // TODO: __m128i _mm_bsrli_si128 (__m128i a, int imm8)
180 
/// Reinterprets a `__m128d` as a `__m128` via a vector cast.
__m128 _mm_castpd_ps (__m128d a) pure @safe
{
    __m128 reinterpreted = cast(__m128) a;
    return reinterpreted;
}
185 
/// Reinterprets a `__m128d` as a `__m128i` via a vector cast.
__m128i _mm_castpd_si128 (__m128d a) pure @safe
{
    __m128i reinterpreted = cast(__m128i) a;
    return reinterpreted;
}
190 
/// Reinterprets a `__m128` as a `__m128d` via a vector cast.
__m128d _mm_castps_pd (__m128 a) pure @safe
{
    __m128d reinterpreted = cast(__m128d) a;
    return reinterpreted;
}
195 
/// Reinterprets a `__m128` as a `__m128i` via a vector cast.
__m128i _mm_castps_si128 (__m128 a) pure @safe
{
    __m128i reinterpreted = cast(__m128i) a;
    return reinterpreted;
}
200 
/// Reinterprets a `__m128i` as a `__m128d` via a vector cast.
__m128d _mm_castsi128_pd (__m128i a) pure @safe
{
    __m128d reinterpreted = cast(__m128d) a;
    return reinterpreted;
}
205 
/// Reinterprets a `__m128i` as a `__m128` via a vector cast.
__m128 _mm_castsi128_ps (__m128i a) pure @safe
{
    __m128 reinterpreted = cast(__m128) a;
    return reinterpreted;
}
210 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin of the same purpose.
    alias _mm_clflush = __builtin_ia32_clflush;
}
215 // TODO
216 
version(LDC)
{
    // Declares the packed-double comparison builtin on top of the LLVM
    // intrinsic; the third argument is the comparison predicate immediate.
    pragma(LDC_intrinsic, "llvm.x86.sse2.cmp.pd")
        double2 __builtin_ia32_cmppd(double2, double2, byte) pure @safe;
}
222 // TODO
223 
/// Builds the `equalMask` of `a` and `b` viewed as `short8`.
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i) equalMask!(short8)(lhs, rhs);
}
228 
/// Builds the `equalMask` of `a` and `b` on their native `__m128i` lanes.
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b) pure @safe
{
    __m128i mask = equalMask!(__m128i)(a, b);
    return mask;
}
233 
/// Builds the `equalMask` of `a` and `b` viewed as `byte16`.
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i) equalMask!(byte16)(lhs, rhs);
}
unittest
{
    __m128i A = _mm_setr_epi8(1, 2, 3, 1, 2, 1, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1);
    __m128i B = _mm_setr_epi8(2, 2, 1, 2, 3, 1, 2, 3, 2, 1, 0, 0, 1, 2, 1, 1);

    // Equal lanes are expected to read back as -1, differing lanes as 0.
    static immutable byte[16] expected =
                             [0,-1, 0, 0, 0,-1, 0, 0, 0, 0, 0,-1, 0, 0, 0, -1];

    __m128i mask = _mm_cmpeq_epi8(A, B);
    byte16 lanes = cast(byte16) mask;
    assert(lanes.array == expected);
}
250 
251 
version(LDC)
{
    // Predicate immediates used below: 0 = equal, 2 = less-or-equal.
    // "Greater or equal" has no predicate of its own, so it is expressed as
    // "b <= a" with the operands swapped.

    /// Lane-wise `a == b`.
    __m128d _mm_cmpeq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 0);
    }

    /// `a == b` on the lower lane.
    __m128d _mm_cmpeq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 0);
    }

    /// Lane-wise `a >= b`, evaluated as `b <= a`.
    __m128d _mm_cmpge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 2);
    }

    /// `a >= b` on the lower lane; the upper lane is taken from `a`.
    __m128d _mm_cmpge_sd (__m128d a, __m128d b) pure @safe
    {
        // The scalar compare keeps the upper lane of its FIRST operand. Since
        // the operands are swapped here, that would be `b`'s upper lane, so
        // the upper lane of `a` is put back explicitly.
        __m128d swapped = __builtin_ia32_cmpsd(b, a, 2);
        return shufflevector!(double2, 0, 3)(swapped, a);
    }

    unittest
    {
        __m128d a = [3.0, 11.0];
        __m128d b = [1.0, 22.0];
        __m128d r = _mm_cmpge_sd(a, b);
        assert((cast(long2) r).array[0] == -1); // 3.0 >= 1.0 -> all bits set
        assert(r.array[1] == 11.0);             // upper lane comes from `a`
    }
}
274 // TODO
275 
276 
277 // TODO
278 /+__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) pure @safe
279 {
280     return cast(__m128i)( cast(short8)a > cast(short8)b );
281 }
282 
283 __m128i _mm_cmpgt_epi32 (__m128i a, __m128i b) pure @safe
284 {
285     return cast(__m128i)( cast(int4)a > cast(int4)b );
286 }
287 
288 __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) pure @safe
289 {
290     return cast(__m128i)( cast(byte16)a > cast(byte16)b );
291 }+/
292 
version(LDC)
{
    // Predicate immediates used below: 1 = less-than, 2 = less-or-equal.
    // "Greater than" has no predicate of its own, so it is expressed as
    // "b < a" with the operands swapped.

    /// Lane-wise `a > b`, evaluated as `b < a`.
    __m128d _mm_cmpgt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 1);
    }

    /// `a > b` on the lower lane; the upper lane is taken from `a`.
    __m128d _mm_cmpgt_sd (__m128d a, __m128d b) pure @safe
    {
        // The scalar compare keeps the upper lane of its FIRST operand. Since
        // the operands are swapped here, that would be `b`'s upper lane, so
        // the upper lane of `a` is put back explicitly.
        __m128d swapped = __builtin_ia32_cmpsd(b, a, 1);
        return shufflevector!(double2, 0, 3)(swapped, a);
    }

    /// Lane-wise `a <= b`.
    __m128d _mm_cmple_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 2);
    }

    /// `a <= b` on the lower lane.
    __m128d _mm_cmple_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 2);
    }

    unittest
    {
        __m128d a = [3.0, 11.0];
        __m128d b = [1.0, 22.0];
        __m128d r = _mm_cmpgt_sd(a, b);
        assert((cast(long2) r).array[0] == -1); // 3.0 > 1.0 -> all bits set
        assert(r.array[1] == 11.0);             // upper lane comes from `a`
    }
}
315 // TODO
316 
317 // TODO
318 /+__m128i _mm_cmplt_epi16 (__m128i a, __m128i b) pure @safe
319 {
320     return cast(__m128i)( cast(short8)a < cast(short8)b );
321 }
322 
323 __m128i _mm_cmplt_epi32 (__m128i a, __m128i b) pure @safe
324 {
325     return cast(__m128i)( cast(int4)a < cast(int4)b );
326 }
327 
328 __m128i _mm_cmplt_epi8 (__m128i a, __m128i b) pure @safe
329 {
    return cast(__m128i)( cast(byte16)a < cast(byte16)b );
331 }+/
332 
version(LDC)
{
    // Predicate immediates used below:
    //   1 = less-than        3 = unordered     4 = not-equal
    //   5 = not-less-than    6 = not-less-or-equal    7 = ordered
    // The "nge"/"ngt" forms have no predicate of their own and are expressed
    // with swapped operands ("not (b <= a)" / "not (b < a)").

    /// Lane-wise `a < b`.
    __m128d _mm_cmplt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 1);
    }

    /// `a < b` on the lower lane.
    __m128d _mm_cmplt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 1);
    }

    /// Lane-wise `a != b`.
    __m128d _mm_cmpneq_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 4);
    }

    /// `a != b` on the lower lane.
    __m128d _mm_cmpneq_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 4);
    }

    /// Lane-wise "not (a >= b)", evaluated as "not (b <= a)".
    __m128d _mm_cmpnge_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 6);
    }

    /// "not (a >= b)" on the lower lane; the upper lane is taken from `a`.
    __m128d _mm_cmpnge_sd (__m128d a, __m128d b) pure @safe
    {
        // The scalar compare keeps the upper lane of its FIRST operand (`b`
        // after the swap), so the upper lane of `a` is put back explicitly.
        __m128d swapped = __builtin_ia32_cmpsd(b, a, 6);
        return shufflevector!(double2, 0, 3)(swapped, a);
    }

    /// Lane-wise "not (a > b)", evaluated as "not (b < a)".
    __m128d _mm_cmpngt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(b, a, 5);
    }

    /// "not (a > b)" on the lower lane; the upper lane is taken from `a`.
    __m128d _mm_cmpngt_sd (__m128d a, __m128d b) pure @safe
    {
        // Same operand swap as _mm_cmpnge_sd: restore the upper lane of `a`.
        __m128d swapped = __builtin_ia32_cmpsd(b, a, 5);
        return shufflevector!(double2, 0, 3)(swapped, a);
    }

    /// Lane-wise "not (a <= b)".
    __m128d _mm_cmpnle_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 6);
    }

    /// "not (a <= b)" on the lower lane.
    __m128d _mm_cmpnle_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 6);
    }

    /// Lane-wise "not (a < b)".
    __m128d _mm_cmpnlt_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 5);
    }

    /// "not (a < b)" on the lower lane.
    __m128d _mm_cmpnlt_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 5);
    }

    /// Lane-wise "ordered" test (predicate 7).
    __m128d _mm_cmpord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 7);
    }

    /// "Ordered" test on the lower lane (predicate 7).
    __m128d _mm_cmpord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 7);
    }

    /// Lane-wise "unordered" test (predicate 3).
    __m128d _mm_cmpunord_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmppd(a, b, 3);
    }

    /// "Unordered" test on the lower lane (predicate 3).
    __m128d _mm_cmpunord_sd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_cmpsd(a, b, 3);
    }

    unittest
    {
        __m128d a = [1.0, 11.0];
        __m128d b = [3.0, 22.0];
        // 1.0 is neither >= nor > 3.0, so both masks are set in the lower lane,
        // and the upper lane must be the one of `a`.
        __m128d nge = _mm_cmpnge_sd(a, b);
        __m128d ngt = _mm_cmpngt_sd(a, b);
        assert((cast(long2) nge).array[0] == -1);
        assert((cast(long2) ngt).array[0] == -1);
        assert(nge.array[1] == 11.0);
        assert(ngt.array[1] == 11.0);
    }
}
415 // TODO
416 
version(LDC)
{
    // Scalar-double "comi" comparisons, each forwarded to the matching builtin.
    alias _mm_comieq_sd  = __builtin_ia32_comisdeq;
    alias _mm_comineq_sd = __builtin_ia32_comisdneq;
    alias _mm_comilt_sd  = __builtin_ia32_comisdlt;
    alias _mm_comile_sd  = __builtin_ia32_comisdle;
    alias _mm_comigt_sd  = __builtin_ia32_comisdgt;
    alias _mm_comige_sd  = __builtin_ia32_comisdge;
}
426 // TODO
427 
428 // TODO: alias _mm_cvtepi32_pd = __builtin_ia32_cvtdq2pd;
429 
version(LDC)
{
    // Conversions forwarded to the matching compiler builtins.
    alias _mm_cvtpd_epi32 = __builtin_ia32_cvtpd2dq;
    alias _mm_cvtepi32_ps = __builtin_ia32_cvtdq2ps;
}
435 
436 // MMXREG: _mm_cvtpd_pi32
version(LDC)
{
    // Conversions forwarded to the matching compiler builtins.
    alias _mm_cvtpd_ps    = __builtin_ia32_cvtpd2ps;
    // MMXREG: _mm_cvtpi32_pd
    alias _mm_cvtps_epi32 = __builtin_ia32_cvtps2dq;
}
443 // TODO
444 
445 // TODO: alias _mm_cvtps_pd = __builtin_ia32_cvtps2pd;
446 
/// Returns element 0 of `a` as a scalar `double`.
double _mm_cvtsd_f64 (__m128d a) pure @safe
{
    const double low = extractelement!(double2, 0)(a);
    return low;
}
451 
version(LDC)
{
    // Scalar double -> integer conversions, forwarded to compiler builtins.
    alias _mm_cvtsd_si32  = __builtin_ia32_cvtsd2si;
    alias _mm_cvtsd_si64  = __builtin_ia32_cvtsd2si64;
    // The "x" spelling is just a second name for the 64-bit variant.
    alias _mm_cvtsd_si64x = _mm_cvtsd_si64;
}
458 // TODO
459 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin.
    alias _mm_cvtsd_ss = __builtin_ia32_cvtsd2ss;
}
464 // TODO
465 
/// Returns element 0 of `a`.
int _mm_cvtsi128_si32 (__m128i a) pure @safe
{
    const int low = a[0];
    return low;
}
470 
/// Returns element 0 of `a` viewed as `long2`.
long _mm_cvtsi128_si64 (__m128i a) pure @safe
{
    long2 wide = cast(long2) a;
    const long low = wide[0];
    return low;
}
/// Second spelling of the same function.
alias _mm_cvtsi128_si64x = _mm_cvtsi128_si64;
477 
version(LDC)
{
    // This LLVM intrinsic still seems to be available.
    pragma(LDC_intrinsic, "llvm.x86.sse2.cvtsi2sd")
        double2 _mm_cvtsi32_sd(double2, int) pure @safe;
}
else
{
    /// Fallback: stores `x` converted to `double` into element 0 of `v`;
    /// element 1 is returned unchanged.
    __m128d _mm_cvtsi32_sd(__m128d v, int x) pure @safe
    {
        __m128d r = v;
        r[0] = cast(double) x;
        return r;
    }
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128d converted = _mm_cvtsi32_sd(zeros, 42);
    assert(converted.array == [42.0, 0]);
}
497 
/// Returns a vector holding `a` in element 0 and zero in the other elements.
__m128i _mm_cvtsi32_si128 (int a) pure @safe
{
    int4 lanes = [0, 0, 0, 0];
    lanes[0] = a;
    return lanes;
}
504 
// Note: on macOS, using "llvm.x86.sse2.cvtsi642sd" was buggy, hence the
// plain D implementation for every compiler.
/// Stores `x` converted to `double` into element 0 of `v`; element 1 is
/// returned unchanged.
__m128d _mm_cvtsi64_sd(__m128d v, long x) pure @safe
{
    __m128d r = v;
    r[0] = cast(double) x;
    return r;
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128d converted = _mm_cvtsi64_sd(zeros, 42);
    assert(converted.array == [42.0, 0]);
}
516 
/// Returns a vector whose lower 64-bit element is `a` and whose upper one is zero.
__m128i _mm_cvtsi64_si128 (long a) pure @safe
{
    long2 lanes = [0, 0];
    lanes[0] = a;
    return cast(__m128i) lanes;
}
523 
// The "64x" spellings are second names for the "64" functions.
alias _mm_cvtsi64x_si128 = _mm_cvtsi64_si128;
alias _mm_cvtsi64x_sd    = _mm_cvtsi64_sd;
526 
version(LDC)
{
    // Bound directly to the LLVM intrinsic.
    pragma(LDC_intrinsic, "llvm.x86.sse2.cvtss2sd")
        double2 _mm_cvtss_sd(double2, float4) pure @safe;
}
else
{
    /// Fallback: stores element 0 of `x` (widened to `double`) into element 0
    /// of `v`; element 1 is returned unchanged.
    double2 _mm_cvtss_sd(double2 v, float4 x) pure @safe
    {
        double2 r = v;
        r[0] = x[0];
        return r;
    }
}
unittest
{
    __m128d zeros = _mm_set1_pd(0.0f);
    __m128 source = _mm_set1_ps(42.0f);
    __m128d converted = _mm_cvtss_sd(zeros, source);
    assert(converted.array == [42.0, 0]);
}
545 
version(LDC)
{
    // "cvtt" conversions, each forwarded to the matching compiler builtin.
    alias _mm_cvttpd_epi32 = __builtin_ia32_cvttpd2dq;
    //MMXREG: _mm_cvttpd_pi32
    alias _mm_cvttps_epi32 = __builtin_ia32_cvttps2dq;
    alias _mm_cvttsd_si32  = __builtin_ia32_cvttsd2si;
    alias _mm_cvttsd_si64  = __builtin_ia32_cvttsd2si64;
    // The "x" spelling is just a second name for the 64-bit variant.
    alias _mm_cvttsd_si64x = _mm_cvttsd_si64;
}
555 // TODO
556 
557 
558 
/// Element-wise division of the two double vectors (`a / b`).
__m128d _mm_div_pd(__m128d a, __m128d b) pure @safe
{
    return a / b;
}
unittest
{
    __m128d a = [2.0, 4.5];
    __m128d b = [4.0, 1.5];
    a = _mm_div_pd(a, b);
    assert(a.array == [0.5, 3.0]);
}

// Historical, misnamed spelling of _mm_div_pd (it operates on doubles, not
// floats). Kept so that existing callers keep compiling; prefer _mm_div_pd.
__m128d _mm_div_ps(__m128d a, __m128d b) pure @safe
{
    return _mm_div_pd(a, b);
}
563 
/// Divides element 0 of `a` by element 0 of `b`; element 1 of `a` is returned unchanged.
__m128d _mm_div_sd(__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    r[0] = a[0] / b[0];
    return r;
}
unittest
{
    __m128d v = [2.0, 4.5];
    __m128d quotient = _mm_div_sd(v, v);
    assert(quotient.array == [1.0, 4.5]);
}
575 
/// Returns the 16-bit lane of `a` selected by the low 3 bits of `imm8`,
/// zero-extended to `int`.
int _mm_extract_epi16(int imm8)(__m128i a) pure @safe
{
    // A single lane is read with extractelement on the short8 view; the
    // cast to ushort makes the widening to int a zero-extension.
    return cast(ushort) extractelement!(short8, imm8 & 7)(cast(short8) a);
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, -1);
    assert(_mm_extract_epi16!0(A) == 65535);
    assert(_mm_extract_epi16!5(A) == 5);
}
580 
/// Returns a copy of `a` in which the 16-bit lane selected by the low 3 bits
/// of `imm8` is replaced by the low 16 bits of `i`.
__m128i _mm_insert_epi16(int imm8)(__m128i a, int i) pure @safe
{
    // insertelement works on the short8 view and takes a `short` value, so
    // both arguments are converted and the result is cast back to __m128i.
    short8 lanes = insertelement!(short8, imm8 & 7)(cast(short8) a, cast(short) i);
    return cast(__m128i) lanes;
}
unittest
{
    __m128i A = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 B = cast(short8) _mm_insert_epi16!3(A, 42);
    static immutable short[8] correct = [0, 1, 2, 42, 4, 5, 6, 7];
    assert(B.array == correct);
}
585 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin.
    alias _mm_lfence = __builtin_ia32_lfence;
}
590 // TODO
591 
592 
/// Reads a whole `__m128d` through `mem_addr` reinterpreted as a vector pointer.
/// NOTE(review): presumably requires `mem_addr` to be suitably aligned for a
/// vector access -- confirm against callers.
__m128d _mm_load_pd (const(double) * mem_addr) pure
{
    __m128d* vecPtr = cast(__m128d*) mem_addr;
    __m128d loaded = *vecPtr;
    return loaded;
}
598 
/// Returns a vector whose two elements both hold the double at `mem_addr`.
__m128d _mm_load_pd1 (const(double)* mem_addr) pure
{
    double[2] pair = [*mem_addr, *mem_addr];
    return loadUnaligned!(double2)(pair.ptr);
}
604 
/// Returns a vector with the double at `mem_addr` in element 0 and zero in element 1.
__m128d _mm_load_sd (const(double)* mem_addr) pure @safe
{
    double2 lanes = [0, 0];
    lanes[0] = *mem_addr;
    return lanes;
}
unittest
{
    double scalar = -42;
    __m128d loaded = _mm_load_sd(&scalar);
    assert(loaded.array == [-42.0, 0.0]);
}
617 
/// Reads the vector that `mem_addr` points to.
__m128i _mm_load_si128 (const(__m128i)* mem_addr) pure @trusted
{
    __m128i loaded = *mem_addr;
    return loaded;
}
622 
/// Second spelling of `_mm_load_pd1`.
alias _mm_load1_pd = _mm_load_pd1;
624 
/// Returns `a` with element 1 replaced by the double at `mem_addr`.
__m128d _mm_loadh_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    __m128d r = a;
    r[1] = *mem_addr;
    return r;
}
630 
// Note: strange signature, since the memory doesn't have to be aligned.
/// Reads one `long` at `mem_addr` into the lower 64-bit element; the upper
/// element is zero.
__m128i _mm_loadl_epi64 (const(__m128i)* mem_addr) pure @safe
{
    const(long)* source = cast(const(long)*) mem_addr;
    long2 lanes = [0, 0];
    lanes[0] = *source;
    return cast(__m128i) lanes;
}
639 
/// Returns `a` with element 0 replaced by the double at `mem_addr`.
__m128d _mm_loadl_pd (__m128d a, const(double)* mem_addr) pure @safe
{
    __m128d r = a;
    r[0] = *mem_addr;
    return r;
}
645 
/// Loads two doubles with `_mm_load_pd` and returns them in swapped order.
__m128d _mm_loadr_pd (const(double)* mem_addr) pure @trusted
{
    __m128d loaded = _mm_load_pd(mem_addr);
    // Mask (1, 0): element 1 first, then element 0.
    return shufflevector!(__m128d, 1, 0)(loaded, loaded);
}
651 
/// Loads two doubles from `mem_addr` via `loadUnaligned`.
__m128d _mm_loadu_pd (const(double)* mem_addr) pure @safe
{
    __m128d loaded = loadUnaligned!(double2)(mem_addr);
    return loaded;
}
656 
/// Loads a `__m128i` from `mem_addr` via `loadUnaligned`.
__m128i _mm_loadu_si128 (const(__m128i)* mem_addr) pure @trusted
{
    int* source = cast(int*) mem_addr;
    return loadUnaligned!(__m128i)(source);
}
661 
version(LDC)
{
    // --- Plain aliases to compiler builtins ---
    alias _mm_madd_epi16      = __builtin_ia32_pmaddwd128;
    alias _mm_maskmoveu_si128 = __builtin_ia32_maskmovdqu;
    alias _mm_mfence          = __builtin_ia32_mfence;

    // --- Integer max/min: the builtin is declared here on top of the LLVM
    //     intrinsic, then aliased to the Intel name ---
    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxs.w")
        short8 __builtin_ia32_pmaxsw128(short8, short8) pure @safe;
    alias _mm_max_epi16 = __builtin_ia32_pmaxsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmaxu.b")
        byte16 __builtin_ia32_pmaxub128(byte16, byte16) pure @safe;
    alias _mm_max_epu8 = __builtin_ia32_pmaxub128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pmins.w")
        short8 __builtin_ia32_pminsw128(short8, short8) pure @safe;
    alias _mm_min_epi16 = __builtin_ia32_pminsw128;

    pragma(LDC_intrinsic, "llvm.x86.sse2.pminu.b")
        byte16 __builtin_ia32_pminub128(byte16, byte16) pure @safe;
    alias _mm_min_epu8 = __builtin_ia32_pminub128;

    // --- Double max/min ---
    alias _mm_max_pd = __builtin_ia32_maxpd;
    alias _mm_max_sd = __builtin_ia32_maxsd;
    alias _mm_min_pd = __builtin_ia32_minpd;
    alias _mm_min_sd = __builtin_ia32_minsd;
}
692 // TODO
693 
/// Keeps the lower 64-bit element of `a` and zeroes the upper one.
__m128i _mm_move_epi64 (__m128i a) pure @safe
{
    long2 source = cast(long2) a;
    long2 lanes = [ 0, 0 ];
    lanes[0] = source[0];
    return cast(__m128i) lanes;
}
unittest
{
    long2 input = [13, 47];
    long2 moved = cast(long2) _mm_move_epi64( cast(__m128i) input );
    long[2] expected = [13, 0];
    assert(moved.array == expected);
}
708 
/// Returns a vector made of element 0 of `b` and element 1 of `a`.
__m128d _mm_move_sd (__m128d a, __m128d b) pure @safe
{
    __m128d r = b;
    r[1] = a[1];
    return r;
}
unittest
{
    double2 upperSource = [13.0, 47.0];
    double2 lowerSource = [34.0, 58.0];
    double2 merged = _mm_move_sd(upperSource, lowerSource);
    double[2] expected = [34.0, 47.0];
    assert(merged.array == expected);
}
722 
version(LDC)
{
    // Mask extraction, forwarded to the matching compiler builtins.
    alias _mm_movemask_pd   = __builtin_ia32_movmskpd;
    alias _mm_movemask_epi8 = __builtin_ia32_pmovmskb128;
}
728 
729 // MMXREG: _mm_movepi64_pi64
730 // MMXREG: __m128i _mm_movpi64_epi64 (__m64 a)
731 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin.
    alias _mm_mul_epu32 = __builtin_ia32_pmuludq128;
}
736 // TODO
737 
/// Element-wise product of the two double vectors.
__m128d _mm_mul_pd(__m128d a, __m128d b) pure @safe
{
    __m128d product = a * b;
    return product;
}
unittest
{
    __m128d v = [-2.0, 1.5];
    __m128d squared = _mm_mul_pd(v, v);
    assert(squared.array == [4.0, 2.25]);
}
748 
/// Multiplies element 0 of `a` by element 0 of `b`; element 1 of `a` is returned unchanged.
__m128d _mm_mul_sd(__m128d a, __m128d b) pure @safe
{
    __m128d r = a;
    r[0] = a[0] * b[0];
    return r;
}
unittest
{
    __m128d v = [-2.0, 1.5];
    __m128d product = _mm_mul_sd(v, v);
    assert(product.array == [4.0, 1.5]);
}
760 
761 
762 // MMXREG: _mm_mul_su32
763 
version(LDC)
{
    // "High half" 16-bit multiplies, forwarded to the compiler builtins.
    alias _mm_mulhi_epu16 = __builtin_ia32_pmulhuw128;
    alias _mm_mulhi_epi16 = __builtin_ia32_pmulhw128;
}
769 // TODO
770 
/// Element-wise product of `a` and `b`, both viewed as `short8`; each lane
/// keeps the 16 bits that fit the `short` element type.
// `pure @safe` was missing here, unlike on the other arithmetic wrappers in
// this module, which made the function unusable from pure or @safe callers.
__m128i _mm_mullo_epi16 (__m128i a, __m128i b) pure @safe
{
    return cast(__m128i)(cast(short8)a * cast(short8)b);
}
775 
/// Bitwise OR of `a` and `b`, performed on their `__m128i` reinterpretation.
__m128d _mm_or_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i) a;
    __m128i bitsB = cast(__m128i) b;
    return cast(__m128d)(bitsA | bitsB);
}
780 
/// Bitwise OR of `a` and `b`.
__m128i _mm_or_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i merged = a | b;
    return merged;
}
785 
version(LDC)
{
    // Pack operations, forwarded to the matching compiler builtins.
    alias _mm_packs_epi16  = __builtin_ia32_packsswb128;
    alias _mm_packs_epi32  = __builtin_ia32_packssdw128;
    alias _mm_packus_epi16 = __builtin_ia32_packuswb128;
}
792 // TODO
793 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin.
    alias _mm_pause = __builtin_ia32_pause;
}
798 // TODO
799 
version(LDC)
{
    // Only defined for LDC: forwards to the compiler builtin.
    alias _mm_sad_epu8 = __builtin_ia32_psadbw128;
}
804 // TODO
805 
/// Builds a vector of eight 16-bit values; `e0` lands in element 0 and `e7`
/// in element 7 (arguments are given highest element first).
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e0, e1, e2, e3, e4, e5, e6, e7];
    short8 packed = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i) packed;
}
unittest
{
    __m128i ramp = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    short8 lanes = cast(short8) ramp;
    foreach (lane; 0 .. 8)
        assert(lanes.array[lane] == lane);
}
818 
/// Builds a vector of four ints; `e0` lands in element 0 and `e3` in element 3.
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e0, e1, e2, e3];
    int4 packed = loadUnaligned!(int4)(lanes.ptr);
    return packed;
}
unittest
{
    __m128i ramp = _mm_set_epi32(3, 2, 1, 0);
    foreach (lane; 0 .. 4)
        assert(ramp.array[lane] == lane);
}
830 
/// Builds a vector of two longs; `e0` lands in element 0 and `e1` in element 1.
__m128i _mm_set_epi64x (long e1, long e0) pure @trusted
{
    long[2] lanes = [e0, e1];
    long2 packed = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i) packed;
}
unittest
{
    __m128i pair = _mm_set_epi64x(1234, 5678);
    long2 lanes = cast(long2) pair;
    assert(lanes.array[0] == 5678);
    assert(lanes.array[1] == 1234);
}
843 
/// Builds a vector of sixteen bytes; `e0` lands in element 0 and `e15` in
/// element 15.
__m128i _mm_set_epi8 (byte e15, byte e14, byte e13, byte e12,
                      byte e11, byte e10, byte e9, byte e8,
                      byte e7, byte e6, byte e5, byte e4,
                      byte e3, byte e2, byte e1, byte e0) pure @trusted
{
    byte[16] lanes = [e0, e1,  e2,  e3,  e4,  e5,  e6,  e7,
                      e8, e9, e10, e11, e12, e13, e14, e15];
    byte16 packed = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i) packed;
}
853 
/// Builds a vector of two doubles; `e0` lands in element 0 and `e1` in element 1.
__m128d _mm_set_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e0, e1];
    __m128d packed = loadUnaligned!(double2)(lanes.ptr);
    return packed;
}
859 
/// Builds a vector whose two elements both equal `a`.
__m128d _mm_set_pd1 (double a) pure @trusted
{
    double[2] lanes = a; // scalar initializer fills every slot
    return loadUnaligned!(double2)(lanes.ptr);
}
865 
/// Builds a vector with `a` in element 0 and zero in element 1.
__m128d _mm_set_sd (double a) pure @trusted
{
    double[2] lanes = 0.0;
    lanes[0] = a;
    return loadUnaligned!(double2)(lanes.ptr);
}
871 
/// Builds a vector whose eight 16-bit elements all equal `a`.
__m128i _mm_set1_epi16 (short a) pure @trusted
{
    short[8] lanes = a; // scalar initializer fills every slot
    short8 packed = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i) packed;
}
877 
/// Builds a vector whose four int elements all equal `a`.
__m128i _mm_set1_epi32 (int a) pure @trusted
{
    int[4] lanes = a; // scalar initializer fills every slot
    return loadUnaligned!(int4)(lanes.ptr);
}
unittest
{
    // ANDing -1.0f with the 0x7fffffff pattern is expected to give 1.0f.
    __m128 minusOne = _mm_set1_ps(-1.0f);
    __m128 mask = cast(__m128) _mm_set1_epi32(0x7fffffff);
    assert(_mm_and_ps(minusOne, mask).array == [1.0f, 1, 1, 1]);
}
889 
/// Builds a vector whose two long elements both equal `a`.
__m128i _mm_set1_epi64x (long a) pure @trusted
{
    long[2] lanes = a; // scalar initializer fills every slot
    long2 packed = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i) packed;
}
895 
/// Builds a vector whose sixteen byte elements all hold `a`.
__m128i _mm_set1_epi8 (char a) pure @trusted
{
    byte[16] lanes = [a, a, a, a, a, a, a, a,
                      a, a, a, a, a, a, a, a];
    byte16 packed = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i) packed;
}
901 
/// Second spelling of `_mm_set_pd1`.
alias _mm_set1_pd = _mm_set_pd1;
903 
/// Builds a vector of eight 16-bit values in argument order: the first
/// argument lands in element 0, the last one in element 7.
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0) pure @trusted
{
    short[8] lanes = [e7, e6, e5, e4, e3, e2, e1, e0];
    short8 packed = loadUnaligned!(short8)(lanes.ptr);
    return cast(__m128i) packed;
}
909 
/// Builds a vector of four ints in argument order: the first argument lands
/// in element 0, the last one in element 3.
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0) pure @trusted
{
    int[4] lanes = [e3, e2, e1, e0];
    int4 packed = loadUnaligned!(int4)(lanes.ptr);
    return cast(__m128i) packed;
}
915 
/// Builds a vector of two longs in argument order: the first argument lands
/// in element 0, the second in element 1.
__m128i _mm_setr_epi64 (long e1, long e0) pure @trusted
{
    long[2] lanes = [e1, e0];
    long2 packed = loadUnaligned!(long2)(lanes.ptr);
    return cast(__m128i) packed;
}
921 
/// Builds a vector of sixteen bytes in argument order: the first argument
/// lands in element 0, the last one in element 15.
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12,
                       char e11, char e10, char e9, char e8,
                       char e7, char e6, char e5, char e4,
                       char e3, char e2, char e1, char e0) pure @trusted
{
    byte[16] lanes = [e15, e14, e13, e12, e11, e10, e9, e8,
                       e7,  e6,  e5,  e4,  e3,  e2, e1, e0];
    byte16 packed = loadUnaligned!(byte16)(lanes.ptr);
    return cast(__m128i) packed;
}
931 
/// Builds a vector of two doubles in argument order: the first argument lands
/// in element 0, the second in element 1.
__m128d _mm_setr_pd (double e1, double e0) pure @trusted
{
    double[2] lanes = [e1, e0];
    __m128d packed = loadUnaligned!(double2)(lanes.ptr);
    return packed;
}
937 
/// Returns a double vector with both elements set to 0.0.
__m128d _mm_setzero_pd () pure @trusted
{
    double[2] zeros = 0.0;
    return loadUnaligned!(double2)(zeros.ptr);
}
943 
/// Returns an integer vector with all four elements set to 0.
__m128i _mm_setzero_si128() pure @trusted
{
    int[4] zeros = 0;
    int4 packed = loadUnaligned!(int4)(zeros.ptr);
    return cast(__m128i) packed;
}
949 
/// Rearranges the four int elements of `a`: destination element N takes the
/// source element selected by bits [2N+1 : 2N] of `imm8`.
__m128i _mm_shuffle_epi32(int imm8)(__m128i a) pure @safe
{
    // Decode the four 2-bit selectors once, at compile time.
    enum int sel0 = (imm8 >> 0) & 3;
    enum int sel1 = (imm8 >> 2) & 3;
    enum int sel2 = (imm8 >> 4) & 3;
    enum int sel3 = (imm8 >> 6) & 3;
    return shufflevector!(int4, sel0, sel1, sel2, sel3)(a, a);
}
957 
/// Builds a vector whose element 0 is picked from `a` (by bit 0 of `imm8`)
/// and whose element 1 is picked from `b` (by bit 1 of `imm8`).
__m128d _mm_shuffle_pd (int imm8)(__m128d a, __m128d b) pure @safe
{
    // shufflevector must be instantiated with the vector type (double2), not
    // the scalar type. Indices 0..1 address `a`, indices 2..3 address `b`.
    enum int fromA = 0 + ( (imm8 >> 0) & 1 );
    enum int fromB = 2 + ( (imm8 >> 1) & 1 );
    return shufflevector!(double2, fromA, fromB)(a, b);
}
unittest
{
    __m128d A = [1.0, 2.0];
    __m128d B = [3.0, 4.0];
    // imm8 = 1: element 0 <- A[1], element 1 <- B[0]
    __m128d R = _mm_shuffle_pd!1(A, B);
    assert(R.array == [2.0, 3.0]);
}
963 
/// Rearranges the four upper 16-bit elements of `a` according to the 2-bit
/// selectors packed in `imm8`; the four lower 16-bit elements are passed
/// through unchanged.
__m128i _mm_shufflehi_epi16(int imm8)(__m128i a) pure @safe
{
    // The shuffle has to work on the short8 view with eight indices: the
    // previous int4 form with four indices shuffled 32-bit elements instead.
    short8 lanes = cast(short8) a;
    return cast(__m128i) shufflevector!(short8, 0, 1, 2, 3,
                                        4 + ( (imm8 >> 0) & 3 ),
                                        4 + ( (imm8 >> 2) & 3 ),
                                        4 + ( (imm8 >> 4) & 3 ),
                                        4 + ( (imm8 >> 6) & 3 ))(lanes, lanes);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int REVERSE = 0x1B; // selectors 3, 2, 1, 0
    short8 R = cast(short8) _mm_shufflehi_epi16!REVERSE(A);
    static immutable short[8] correct = [0, 1, 2, 3, 7, 6, 5, 4];
    assert(R.array == correct);
}
971 
/// Rearranges the four lower 16-bit elements of `a` according to the 2-bit
/// selectors packed in `imm8`; the four upper 16-bit elements are passed
/// through unchanged.
__m128i _mm_shufflelo_epi16(int imm8)(__m128i a) pure @safe
{
    // The shuffle has to work on the short8 view with eight indices: the
    // previous int4 form with four indices shuffled 32-bit elements instead.
    short8 lanes = cast(short8) a;
    return cast(__m128i) shufflevector!(short8, ( (imm8 >> 0) & 3 ),
                                                ( (imm8 >> 2) & 3 ),
                                                ( (imm8 >> 4) & 3 ),
                                                ( (imm8 >> 6) & 3 ),
                                                4, 5, 6, 7)(lanes, lanes);
}
unittest
{
    __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    enum int REVERSE = 0x1B; // selectors 3, 2, 1, 0
    short8 R = cast(short8) _mm_shufflelo_epi16!REVERSE(A);
    static immutable short[8] correct = [3, 2, 1, 0, 4, 5, 6, 7];
    assert(R.array == correct);
}
979 
version(LDC)
{
    // Left shifts with the count given in a vector ...
    alias _mm_sll_epi16  = __builtin_ia32_psllw128;
    alias _mm_sll_epi32  = __builtin_ia32_pslld128;
    alias _mm_sll_epi64  = __builtin_ia32_psllq128;

    // ... and with an immediate count.
    alias _mm_slli_epi16 = __builtin_ia32_psllwi128;
    alias _mm_slli_epi32 = __builtin_ia32_pslldi128;
    alias _mm_slli_epi64 = __builtin_ia32_psllqi128;
}
989 // TODO
990 
/// Shifts the whole 128-bit value left by `imm8` bytes, filling with zero
/// bytes. Any `imm8` above 15 yields an all-zero vector.
__m128i _mm_slli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
        return _mm_setzero_si128();
    else
        // Byte shuffle over (zeros, op): indices below 16 pick a zero byte,
        // indices 16..31 pick a byte of `op`. Both operands must be byte16
        // and the result is cast back, as _mm_srli_si128 does.
        return cast(__m128i) shufflevector!(byte16,
        16 - imm8, 17 - imm8, 18 - imm8, 19 - imm8, 20 - imm8, 21 - imm8, 22 - imm8, 23 - imm8,
        24 - imm8, 25 - imm8, 26 - imm8, 27 - imm8, 28 - imm8, 29 - imm8, 30 - imm8, 31 - imm8)
        (cast(byte16)_mm_setzero_si128(), cast(byte16) op);
}
unittest
{
    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
    assert(_mm_slli_si128!4(A).array == [0, 1, 2, 3]);
    assert(_mm_slli_si128!16(A).array == [0, 0, 0, 0]);
}
1001 
version(LDC)
{
    /// Thin wrapper over the `__builtin_ia32_sqrtpd` builtin.
    __m128d _mm_sqrt_pd (__m128d a) pure @safe
    {
        __m128d roots = __builtin_ia32_sqrtpd(a);
        return roots;
    }
}
1009 // TODO
1010 
version(LDC)
{
    /// Thin wrapper over the `__builtin_ia32_sqrtsd` builtin.
    __m128d _mm_sqrt_sd (__m128d a) pure @safe
    {
        __m128d root = __builtin_ia32_sqrtsd(a);
        return root;
    }
}
1018 // TODO
1019 
1020 
version(LDC)
{
    // "sra" shifts: count in a vector, then immediate count.
    alias _mm_sra_epi16  = __builtin_ia32_psraw128;
    alias _mm_sra_epi32  = __builtin_ia32_psrad128;
    alias _mm_srai_epi16 = __builtin_ia32_psrawi128;
    alias _mm_srai_epi32 = __builtin_ia32_psradi128;

    // "srl" shifts: count in a vector ...
    alias _mm_srl_epi16  = __builtin_ia32_psrlw128;
    alias _mm_srl_epi32  = __builtin_ia32_psrld128;
    alias _mm_srl_epi64  = __builtin_ia32_psrlq128;

    // ... and with an immediate count.
    alias _mm_srli_epi16 = __builtin_ia32_psrlwi128;
    alias _mm_srli_epi32 = __builtin_ia32_psrldi128;
    alias _mm_srli_epi64 = __builtin_ia32_psrlqi128;
}
1035 // TODO
1036 
/// Shifts the whole 128-bit value right by `imm8` bytes, filling with zero
/// bytes. Any `imm8` above 15 yields an all-zero vector.
__m128i _mm_srli_si128(ubyte imm8)(__m128i op) pure @safe
{
    static if (imm8 & 0xF0)
    {
        return _mm_setzero_si128();
    }
    else
    {
        // Byte shuffle over (op, zeros): indices 0..15 pick a byte of `op`,
        // indices 16..31 pick a zero byte.
        byte16 source = cast(byte16) op;
        byte16 zeros = cast(byte16) _mm_setzero_si128();
        return cast(__m128i) shufflevector!(byte16,
                                            imm8+0, imm8+1, imm8+2,  imm8+3,  imm8+4,  imm8+5,  imm8+6,  imm8+7,
                                            imm8+8, imm8+9, imm8+10, imm8+11, imm8+12, imm8+13, imm8+14, imm8+15)
                                           (source, zeros);
    }
}
1047 
// Note: this is a bonus intrinsic (an overload for `__m128` operands).
/// Byte-wise right shift of a float vector: forwards to the `__m128i`
/// overload on the reinterpreted bits and casts the result back.
// `pure` added so the attributes match the __m128i and __m128d overloads.
__m128 _mm_srli_si128(ubyte imm8)(__m128 op) pure @safe
{
    return cast(__m128)_mm_srli_si128!imm8(cast(__m128i)op);
}
unittest
{
    // test that cast works at all
    __m128 A = cast(__m128) _mm_set1_epi32(0x3F800000);
    assert(A.array == [1.0f, 1.0f, 1.0f, 1.0f]);

    // test _mm_srli_si128 for __m128i, then for __m128
    assert(_mm_srli_si128!4(_mm_set_epi32(4, 3, 2, 1)).array == [2, 3, 4, 0]);
    assert(_mm_srli_si128!8(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)).array == [3.0f, 4.0f, 0, 0]);
}
1063 
/// Byte-wise right shift of a double vector: forwards to the `__m128i`
/// overload on the reinterpreted bits and casts the result back.
__m128d _mm_srli_si128(ubyte imm8)(__m128d op) pure @safe
{
    __m128i bits = cast(__m128i) op;
    __m128i shifted = _mm_srli_si128!imm8(bits);
    return cast(__m128d) shifted;
}
1068 
/// Writes `a` through `mem_addr` reinterpreted as a vector pointer.
/// NOTE(review): presumably requires `mem_addr` to be suitably aligned for a
/// vector access -- confirm against callers.
void _mm_store_pd (double* mem_addr, __m128d a) pure
{
    __m128d* destination = cast(__m128d*) mem_addr;
    *destination = a;
}
1074 
/// Writes element 0 of `a` into both doubles at `mem_addr`, through a vector
/// pointer.
void _mm_store_pd1 (double* mem_addr, __m128d a) pure
{
    __m128d* destination = cast(__m128d*) mem_addr;
    // Mask (0, 0): duplicate element 0 into both positions.
    __m128d duplicated = shufflevector!(double2, 0, 0)(a, a);
    *destination = duplicated;
}
1080 
/// Writes element 0 of `a` to `*mem_addr`.
void _mm_store_sd (double* mem_addr, __m128d a) pure @safe
{
    const double low = extractelement!(double2, 0)(a);
    *mem_addr = low;
}
1085 
/// Writes the whole vector `a` to `*mem_addr`.
void _mm_store_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    (*mem_addr) = a;
}
1090 
/// Second spelling of `_mm_store_pd1`.
alias _mm_store1_pd = _mm_store_pd1;
1092 
/// Writes element 1 of `a` to `*mem_addr`.
void _mm_storeh_pd (double* mem_addr, __m128d a) pure @safe
{
    const double high = extractelement!(double2, 1)(a);
    *mem_addr = high;
}
1097 
/// Writes the lower 64-bit element of `a` to the first 8 bytes at `mem_addr`.
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a) pure @safe
{
    long2 wide = cast(long2) a;
    long* destination = cast(long*) mem_addr;
    *destination = extractelement!(long2, 0)(wide);
}
1103 
/// Writes element 0 of `a` to `*mem_addr`.
void _mm_storel_pd (double* mem_addr, __m128d a) pure @safe
{
    const double low = extractelement!(double2, 0)(a);
    *mem_addr = low;
}
1108 
/// Writes the two elements of `a` in swapped order at `mem_addr`, through a
/// vector pointer.
void _mm_storer_pd (double* mem_addr, __m128d a) pure
{
    __m128d* destination = cast(__m128d*) mem_addr;
    // Mask (1, 0): element 1 first, then element 0.
    __m128d swapped = shufflevector!(double2, 1, 0)(a, a);
    *destination = swapped;
}
1114 
/// Stores `a` at `mem_addr` via `storeUnaligned`.
void _mm_storeu_pd (double* mem_addr, __m128d a) pure @safe
{
    storeUnaligned!(double2)(a, mem_addr);
}
1119 
/// Stores `a` at `mem_addr` via `storeUnaligned`.
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a) pure @safe
{
    int* destination = cast(int*) mem_addr;
    storeUnaligned!(__m128i)(a, destination);
}
1124 
1125 // TODO: _mm_stream_pd
1126 // TODO: _mm_stream_si128
1127 // TODO: _mm_stream_si32
1128 // TODO: _mm_stream_si64
1129 
/// Element-wise difference `a - b`, both viewed as `short8`.
__m128i _mm_sub_epi16(__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8) a;
    short8 rhs = cast(short8) b;
    return cast(__m128i)(lhs - rhs);
}
1134 
/// Element-wise difference `a - b`, both viewed as `int4`.
__m128i _mm_sub_epi32(__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4) a;
    int4 rhs = cast(int4) b;
    return cast(__m128i)(lhs - rhs);
}
1139 
/// Element-wise difference `a - b`, both viewed as `long2`.
__m128i _mm_sub_epi64(__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2) a;
    long2 rhs = cast(long2) b;
    return cast(__m128i)(lhs - rhs);
}
1144 
/// Element-wise difference `a - b`, both viewed as `byte16`.
__m128i _mm_sub_epi8(__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16) a;
    byte16 rhs = cast(byte16) b;
    return cast(__m128i)(lhs - rhs);
}
1149 
/// Element-wise difference `a - b` of the two double vectors.
__m128d _mm_sub_pd(__m128d a, __m128d b) pure @safe
{
    __m128d difference = a - b;
    return difference;
}
1154 
/// Subtract element 0 of `b` from element 0 of `a`.
/// Element 1 of the result is element 1 of `a`, unchanged.
__m128d _mm_sub_sd(__m128d a, __m128d b) pure @safe
{
    // `a` is passed by value, so editing it in place only affects the result.
    a[0] = a[0] - b[0];
    return a;
}
unittest
{
    // Subtracting a vector from itself zeroes lane 0 and leaves lane 1 alone.
    __m128d v = [1.5, -2.0];
    __m128d r = _mm_sub_sd(v, v);
    assert(r.array == [0.0, -2.0]);
}
1166 
1167 
1168 // MMXREG: _mm_sub_si64
1169 
// The intrinsics below are defined only when compiling with LDC: each one is
// a direct alias of the compiler builtin of the corresponding name. This
// block has no `else` branch, so nothing here defines them for other compilers.
version(LDC)
{
    // _mm_subs_* family, forwarded to the psubs*/psubus* builtins.
    alias _mm_subs_epi16 = __builtin_ia32_psubsw128;
    alias _mm_subs_epi8 = __builtin_ia32_psubsb128;
    alias _mm_subs_epu16 = __builtin_ia32_psubusw128;
    alias _mm_subs_epu8 = __builtin_ia32_psubusb128;

    // _mm_ucomi*_sd family, forwarded to the ucomisd* builtins
    // (eq, ge, gt, le, lt, neq).
    alias _mm_ucomieq_sd = __builtin_ia32_ucomisdeq;
    alias _mm_ucomige_sd = __builtin_ia32_ucomisdge;
    alias _mm_ucomigt_sd = __builtin_ia32_ucomisdgt;
    alias _mm_ucomile_sd = __builtin_ia32_ucomisdle;
    alias _mm_ucomilt_sd = __builtin_ia32_ucomisdlt;
    alias _mm_ucomineq_sd = __builtin_ia32_ucomisdneq;
}
1184 // TODO
1185 
/// Return a `__m128d` whose contents are deliberately left uninitialized
/// (`= void`); callers must not rely on any particular value.
__m128d _mm_undefined_pd() pure @safe
{
    __m128d uninitialized = void;
    return uninitialized;
}
/// Return a `__m128i` whose contents are deliberately left uninitialized
/// (`= void`); callers must not rely on any particular value.
__m128i _mm_undefined_si128() pure @safe
{
    __m128i uninitialized = void;
    return uninitialized;
}
1196 
/// Interleave the upper four 16-bit lanes of `a` and `b`:
/// result = [a4, b4, a5, b5, a6, b6, a7, b7].
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8)a;
    short8 rhs = cast(short8)b;
    // Mask indices 0..7 pick from `lhs`, 8..15 pick from `rhs`.
    return cast(__m128i) shufflevector!(short8, 4, 12, 5, 13, 6, 14, 7, 15)(lhs, rhs);
}
1202 
/// Interleave the upper two 32-bit lanes of `a` and `b`:
/// result = [a2, b2, a3, b3].
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b) pure @safe
{
    int4 lhs = cast(int4)a;
    int4 rhs = cast(int4)b;
    // Mask indices 0..3 pick from `lhs`, 4..7 pick from `rhs`.
    return shufflevector!(int4, 2, 6, 3, 7)(lhs, rhs);
}
1207 
/// Combine the upper 64-bit lanes of `a` and `b`: result = [a1, b1].
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    // Mask indices 0..1 pick from `lhs`, 2..3 pick from `rhs`.
    return cast(__m128i) shufflevector!(long2, 1, 3)(lhs, rhs);
}
1212 
/// Interleave the upper eight 8-bit lanes of `a` and `b`:
/// result = [a8, b8, a9, b9, ..., a15, b15].
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16)a;
    byte16 rhs = cast(byte16)b;
    // Mask indices 0..15 pick from `lhs`, 16..31 pick from `rhs`.
    return cast(__m128i) shufflevector!(byte16,
                                        8, 24,  9, 25, 10, 26, 11, 27,
                                        12, 28, 13, 29, 14, 30, 15, 31)(lhs, rhs);
}
1219 
/// Combine the upper doubles of `a` and `b`: result = [a1, b1].
__m128d _mm_unpackhi_pd (__m128d a, __m128d b) pure @safe
{
    // Mask indices 0..1 pick from `a`, 2..3 pick from `b`.
    __m128d highs = shufflevector!(__m128d, 1, 3)(a, b);
    return highs;
}
1224 
/// Interleave the lower four 16-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1, a2, b2, a3, b3].
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b) pure @safe
{
    short8 lhs = cast(short8)a;
    short8 rhs = cast(short8)b;
    // Mask indices 0..7 pick from `lhs`, 8..15 pick from `rhs`.
    return cast(__m128i) shufflevector!(short8, 0, 8, 1, 9, 2, 10, 3, 11)(lhs, rhs);
}
1230 
/// Interleave the lower two 32-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1].
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b) pure @safe
{
    // Mask indices 0..3 pick from `a`, 4..7 pick from `b`.
    // The last index must be 5 (b1); it was previously 6, which selected b2
    // and broke the a0,b0,a1,b1 interleave pattern.
    return shufflevector!(int4, 0, 4, 1, 5)
                         (cast(int4)a, cast(int4)b);
}
unittest
{
    __m128i a = [0, 1, 2, 3];
    __m128i b = [4, 5, 6, 7];
    __m128i r = _mm_unpacklo_epi32(a, b);
    assert(r.array == [0, 4, 1, 5]);
}
1236 
/// Combine the lower 64-bit lanes of `a` and `b`: result = [a0, b0].
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b) pure @safe
{
    long2 lhs = cast(long2)a;
    long2 rhs = cast(long2)b;
    // Mask indices 0..1 pick from `lhs`, 2..3 pick from `rhs`.
    return cast(__m128i) shufflevector!(long2, 0, 2)(lhs, rhs);
}
1242 
/// Interleave the lower eight 8-bit lanes of `a` and `b`:
/// result = [a0, b0, a1, b1, ..., a7, b7].
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b) pure @safe
{
    byte16 lhs = cast(byte16)a;
    byte16 rhs = cast(byte16)b;
    // Mask indices 0..15 pick from `lhs`, 16..31 pick from `rhs`.
    return cast(__m128i) shufflevector!(byte16,
                                        0, 16, 1, 17, 2, 18, 3, 19,
                                        4, 20, 5, 21, 6, 22, 7, 23)(lhs, rhs);
}
1249 
/// Combine the lower doubles of `a` and `b`: result = [a0, b0].
__m128d _mm_unpacklo_pd (__m128d a, __m128d b) pure @safe
{
    // Mask indices 0..1 pick from `a`, 2..3 pick from `b`.
    __m128d lows = shufflevector!(__m128d, 0, 2)(a, b);
    return lows;
}
1254 
/// Bitwise XOR of `a` and `b`. Both operands are reinterpreted as integer
/// vectors for the XOR, and the result is reinterpreted back as `__m128d`.
__m128d _mm_xor_pd (__m128d a, __m128d b) pure @safe
{
    __m128i bitsA = cast(__m128i)a;
    __m128i bitsB = cast(__m128i)b;
    return cast(__m128d)(bitsA ^ bitsB);
}
1259 
/// Bitwise XOR of the two 128-bit integer vectors.
__m128i _mm_xor_si128 (__m128i a, __m128i b) pure @safe
{
    __m128i mixed = a ^ b;
    return mixed;
}
1264 
unittest
{
    // Usage example: Euclidean distance between two points in 4D.
    float distance(float[4] a, float[4] b) nothrow @nogc
    {
        // Per-lane difference, then squared.
        __m128 delta = _mm_sub_ps(_mm_loadu_ps(a.ptr), _mm_loadu_ps(b.ptr));
        __m128 squares = _mm_mul_ps(delta, delta);

        // Horizontal sum: add the vector to itself shifted by 8 bytes, then
        // by 4 bytes, so that lane 0 accumulates all four squared terms.
        __m128 pairSums = _mm_add_ps(squares, _mm_srli_si128!8(squares));
        __m128 total = _mm_add_ps(pairSums, _mm_srli_si128!4(pairSums));

        // Square root of lane 0, returned as a scalar.
        return _mm_cvtss_f32(_mm_sqrt_ss(total));
    }
    assert(distance([0, 2, 0, 0], [0, 0, 0, 0]) == 2);
}