1 /**
2 * SSE4.1 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.smmintrin;
9 
10 // SSE4.1 instructions
11 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
12 // Note: this header will work whether you have SSE4.1 enabled or not.
13 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
14 // generate SSE4.1 instructions.
15 
16 public import inteli.types;
17 import inteli.internals;
18 
19 // smmintrin pulls in all previous instruction set intrinsics.
20 public import inteli.tmmintrin;
21 
22 nothrow @nogc:
23 
// Rounding-control constants for SSE4.1. The first group holds the individual
// fields; the second group holds ready-made combinations of those fields.
enum : int
{
    _MM_FROUND_TO_NEAREST_INT = 0x00, /// SSE4.1 rounding modes
    _MM_FROUND_TO_NEG_INF     = 0x01, /// ditto
    _MM_FROUND_TO_POS_INF     = 0x02, /// ditto
    _MM_FROUND_TO_ZERO        = 0x03, /// ditto
    _MM_FROUND_CUR_DIRECTION  = 0x04, /// ditto
    _MM_FROUND_RAISE_EXC      = 0x00, /// ditto
    _MM_FROUND_NO_EXC         = 0x08, /// ditto

    // Combinations of an exception-behaviour flag with a rounding direction.
    _MM_FROUND_NINT           = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT,
    _MM_FROUND_FLOOR          = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF,
    _MM_FROUND_CEIL           = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF,
    _MM_FROUND_TRUNC          = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO,
    _MM_FROUND_RINT           = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION,
    _MM_FROUND_NEARBYINT      = _MM_FROUND_NO_EXC    | _MM_FROUND_CUR_DIRECTION,
}
38 
/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`:
/// for each of the eight lanes, a set bit `n` of `imm8` selects lane `n` of `b`,
/// a clear bit selects lane `n` of `a`.
// Note: the signature differs from the C intrinsic because GDC needs `imm8` as a compile-time value.
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) @trusted
{
    // PERF DMD
    // PERF LDC: clang can call __builtin_ia32_pblendw128, but for some reason we cannot.
    // Not sure how to get vblendw
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
    }
    else
    {
        // Generic path: view both operands as eight shorts and pick lane by lane.
        short8 lanesA = cast(short8)a;
        short8 lanesB = cast(short8)b;
        short8 blended;
        foreach (lane; 0 .. 8)
        {
            if (imm8 & (1 << lane))
                blended.ptr[lane] = lanesB.array[lane];
            else
                blended.ptr[lane] = lanesA.array[lane];
        }
        return cast(__m128i)blended;
    }
}
unittest
{
    // Mask 147 == 0b1001_0011: lanes 0, 1, 4 and 7 come from the second operand.
    enum int mask = 0b1001_0011;
    __m128i lo = _mm_setr_epi16(0, 1,  2,  3,  4,  5,  6,  7);
    __m128i hi = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
    short[8] expected =        [8, 9,  2,  3, 12,  5,  6, 15];
    short8 blended = cast(short8) _mm_blend_epi16!mask(lo, hi);
    assert(blended.array == expected);
}
70 
71 
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using
/// control mask `imm8`, and return the result.
/// Bit `n` of `imm8` (n = 0, 1) selects element `n` of `b` when set, of `a` when clear.
/// `imm8` must be in the range 0 .. 3.
// Note: changed signature, GDC needs a compile-time value for imm8.
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
{
    static assert(imm8 >= 0 && imm8 < 4);
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        // Operands and result are double2 here (the previous short8/__m128i casts
        // were copied from the 16-bit blend and did not match this function's types).
        return cast(__m128d) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
    }
    else
    {
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
        double2 r;
        for (int n = 0; n < 2; ++n)
        {
            r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
        }
        return cast(__m128d)r;
    }
}

/// Runtime-mask overload of `_mm_blend_pd`, kept so that existing calls of the form
/// `_mm_blend_pd(a, b, imm8)` continue to compile.
/// Only the two low bits of `imm8` are used; bit `n` selects element `n` of `b` when set,
/// of `a` when clear.
__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8) @trusted
{
    // Dispatch to the compile-time version so that every compiler path
    // (including the GDC builtin, which needs an immediate) is usable.
    switch (imm8 & 3)
    {
        case 0:  return _mm_blend_pd!0(a, b);
        case 1:  return _mm_blend_pd!1(a, b);
        case 2:  return _mm_blend_pd!2(a, b);
        default: return _mm_blend_pd!3(a, b);
    }
}
unittest
{
    __m128d A = _mm_setr_pd(0, 1);
    __m128d B = _mm_setr_pd(8, 9);

    // Compile-time mask form.
    double2 C = _mm_blend_pd!2(A, B); // 10
    double[2] correct =    [0, 9];
    assert(C.array == correct);

    // Runtime mask form (original signature).
    double2 D = _mm_blend_pd(A, B, 2); // 10
    assert(D.array == correct);

    // Remaining masks, checked through both forms.
    double[2] onlyA  = [0, 1];
    double[2] lowB   = [8, 1];
    double[2] onlyB  = [8, 9];
    double2 R0 = _mm_blend_pd!0(A, B);
    double2 R1 = _mm_blend_pd!1(A, B);
    double2 R3 = _mm_blend_pd!3(A, B);
    assert(R0.array == onlyA);
    assert(R1.array == lowB);
    assert(R3.array == onlyB);
    double2 S0 = _mm_blend_pd(A, B, 0);
    double2 S1 = _mm_blend_pd(A, B, 1);
    double2 S3 = _mm_blend_pd(A, B, 3);
    assert(S0.array == onlyA);
    assert(S1.array == lowB);
    assert(S3.array == onlyB);
}
100 
101 
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using
/// control mask `imm8`: a set bit `n` of `imm8` selects element `n` of `b`, a clear bit
/// selects element `n` of `a`. `imm8` must be in the range 0 .. 15.
// Note: the signature differs from the C intrinsic because GDC needs `imm8` as a compile-time value.
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static assert(imm8 >= 0 && imm8 < 16);
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_blendps(a, b, imm8);
    }
    else version(LDC)
    {
        // LDC x86: generates blendps since LDC 1.1 -O2
        //   arm64: pretty good, two instructions worst case
        // Shuffle indices 0..3 address `a`, indices 4..7 address `b`.
        enum int idx0 = (imm8 & 1) ? 4 : 0;
        enum int idx1 = (imm8 & 2) ? 5 : 1;
        enum int idx2 = (imm8 & 4) ? 6 : 2;
        enum int idx3 = (imm8 & 8) ? 7 : 3;
        return shufflevector!(float4, idx0, idx1, idx2, idx3)(a, b);
    }
    else
    {
        // Generic path: pick each of the four floats individually.
        __m128 blended;
        foreach (lane; 0 .. 4)
        {
            if (imm8 & (1 << lane))
                blended.ptr[lane] = b.array[lane];
            else
                blended.ptr[lane] = a.array[lane];
        }
        return blended;
    }
}
unittest
{
    // Mask 13 == 0b1101: elements 0, 2 and 3 come from the second operand.
    enum int mask = 0b1101;
    __m128 lo = _mm_setr_ps(0, 1,  2,  3);
    __m128 hi = _mm_setr_ps(8, 9, 10, 11);
    float[4] expected =    [8, 1, 10, 11];
    float4 blended = cast(float4) _mm_blend_ps!mask(lo, hi);
    assert(blended.array == expected);
}
139 
140 
141 /*
142 /// Blend packed 8-bit integers from a and b using mask, and store the results in dst.
143 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
144 {
145 }
146 unittest
147 {
148 }
149 */
150 
151 /*
152 /// Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.
153 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
154 {
155 }
156 unittest
157 {
158 }
159 */
160 
161 /*
162 /// Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.
163 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
164 {
165 }
166 unittest
167 {
168 }
169 */
170 
171 /*
172 /// Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
173 __m128d _mm_ceil_pd (__m128d a) @trusted
174 {
175 }
176 unittest
177 {
178 }
179 */
180 
181 /*
182 /// Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.
183 __m128 _mm_ceil_ps (__m128 a) @trusted
184 {
185 }
186 unittest
187 {
188 }
189 */
190 
191 /*
192 /// Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
193 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
194 {
195 }
196 unittest
197 {
198 }
199 */
200 
201 /*
202 /// Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
203 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
204 {
205 }
206 unittest
207 {
208 }
209 */
210 
211 /*
212 /// Compare packed 64-bit integers in a and b for equality, and store the results in dst.
213 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
214 {
215 }
216 unittest
217 {
218 }
219 */
220 
221 /*
222 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
223 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
224 {
225 }
226 unittest
227 {
228 }
229 */
230 
231 /*
232 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
233 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
234 {
235 }
236 unittest
237 {
238 }
239 */
240 
241 /*
242 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
243 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
244 {
245 }
246 unittest
247 {
248 }
249 */
250 
251 /*
252 /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
253 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
254 {
255 }
256 unittest
257 {
258 }
259 */
260 
261 /*
262 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
263 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
264 {
265 }
266 unittest
267 {
268 }
269 */
270 
271 /*
272 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
273 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
274 {
275 }
276 unittest
277 {
278 }
279 */
280 
281 /*
282 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
283 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
284 {
285 }
286 unittest
287 {
288 }
289 */
290 
291 /*
292 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
293 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
294 {
295 }
296 unittest
297 {
298 }
299 */
300 
301 /*
302 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
303 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
304 {
305 }
306 unittest
307 {
308 }
309 */
310 
311 /*
312 /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
313 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
314 {
315 }
316 unittest
317 {
318 }
319 */
320 
321 /*
322 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
323 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
324 {
325 }
326 unittest
327 {
328 }
329 */
330 
331 /*
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
333 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
334 {
335 }
336 unittest
337 {
338 }
339 */
340 
341 /*
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the two products, and conditionally store the sum in dst using the low 4 bits of imm8.
343 __m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8) @trusted
344 {
345 }
346 unittest
347 {
348 }
349 */
350 
351 /*
352 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
353 __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8) @trusted
354 {
355 }
356 unittest
357 {
358 }
359 */
360 
361 /*
362 /// Extract a 32-bit integer from a, selected with imm8, and store the result in dst.
363 int _mm_extract_epi32 (__m128i a, const int imm8) @trusted
364 {
365 }
366 unittest
367 {
368 }
369 */
370 
371 /*
372 /// Extract a 64-bit integer from a, selected with imm8, and store the result in dst.
373 __int64 _mm_extract_epi64 (__m128i a, const int imm8) @trusted
374 {
375 }
376 unittest
377 {
378 }
379 */
380 
381 /*
382 /// Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst.
383 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
384 {
385 }
386 unittest
387 {
388 }
389 */
390 
391 /*
392 /// Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst.
393 int _mm_extract_ps (__m128 a, const int imm8) @trusted
394 {
395 }
396 unittest
397 {
398 }
399 */
400 
401 /*
402 /// Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.
403 __m128d _mm_floor_pd (__m128d a) @trusted
404 {
405 }
406 unittest
407 {
408 }
409 */
410 
411 /*
412 /// Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.
413 __m128 _mm_floor_ps (__m128 a) @trusted
414 {
415 }
416 unittest
417 {
418 }
419 */
420 
421 /*
422 /// Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
423 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
424 {
425 }
426 unittest
427 {
428 }
429 */
430 
431 /*
432 /// Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
433 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
434 {
435 }
436 unittest
437 {
438 }
439 */
440 
441 /*
442 /// Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8.
443 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) @trusted
444 {
445 }
446 unittest
447 {
448 }
449 */
450 
451 /*
452 /// Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8.
453 __m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8) @trusted
454 {
455 }
456 unittest
457 {
458 }
459 */
460 
461 /*
462 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
463 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
464 {
465 }
466 unittest
467 {
468 }
469 */
470 
471 /*
472 /// Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set).
473 __m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8) @trusted
474 {
475 }
476 unittest
477 {
478 }
479 */
480 
481 /*
482 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
483 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
484 {
485 }
486 unittest
487 {
488 }
489 */
490 
491 /*
492 /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
493 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
494 {
495 }
496 unittest
497 {
498 }
499 */
500 
501 /*
502 /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
503 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
504 {
505 }
506 unittest
507 {
508 }
509 */
510 
511 /*
512 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
513 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
514 {
515 }
516 unittest
517 {
518 }
519 */
520 
521 /*
522 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
523 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
524 {
525 }
526 unittest
527 {
528 }
529 */
530 
531 /*
532 /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
533 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
534 {
535 }
536 unittest
537 {
538 }
539 */
540 
541 /*
542 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
543 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
544 {
545 }
546 unittest
547 {
548 }
549 */
550 
551 /*
552 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
553 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
554 {
555 }
556 unittest
557 {
558 }
559 */
560 
561 /*
562 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst.
563 __m128i _mm_minpos_epu16 (__m128i a) @trusted
564 {
565 }
566 unittest
567 {
568 }
569 */
570 
571 /*
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.
573 __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8) @trusted
574 {
575 }
576 unittest
577 {
578 }
579 */
580 
581 /*
582 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
583 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
584 {
585 }
586 unittest
587 {
588 }
589 */
590 
591 /*
592 /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
593 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
594 {
595 }
596 unittest
597 {
598 }
599 */
600 
601 /*
602 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
603 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
604 {
605 }
606 unittest
607 {
608 }
609 */
610 
611 /// Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst.
612 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
613 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
614 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
615 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
616 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
617 /*
618 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
619 __m128d _mm_round_pd (__m128d a, int rounding) @trusted
620 {
621 }
622 unittest
623 {
624 }
625 */
626 
627 /// Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst.
628 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
629 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
630 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
631 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
632 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
633 /*
634 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
635 __m128 _mm_round_ps (__m128 a, int rounding) @trusted
636 {
637 }
638 unittest
639 {
640 }
641 */
642 
643 /// Round the lower double-precision (64-bit) floating-point element in b using the rounding parameter, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
644 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
645 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
646 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
647 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
648 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
649 /*
650 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
651 __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) @trusted
652 {
653 }
654 unittest
655 {
656 }
657 */
658 
659 /// Round the lower single-precision (32-bit) floating-point element in b using the rounding parameter, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
660 /// Rounding is done according to the rounding[3:0] parameter, which can be one of:
661 ///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
662 ///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
663 ///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
664 ///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
665 /*
666 ///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
667 __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) @trusted
668 {
669 }
670 unittest
671 {
672 }
673 */
674 
675 /*
676 /// Load 128-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
677 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
678 {
679 }
680 unittest
681 {
682 }
683 */
684 
685 /*
686 /// Compute the bitwise NOT of a and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0.
687 int _mm_test_all_ones (__m128i a) @trusted
688 {
689 }
690 unittest
691 {
692 }
693 */
694 
695 /*
696 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0.
697 int _mm_test_all_zeros (__m128i a, __m128i mask) @trusted
698 {
699 }
700 unittest
701 {
702 }
703 */
704 
705 /*
706 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
707 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
708 {
709 }
710 unittest
711 {
712 }
713 */
714 
715 /*
716 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.
717 int _mm_testc_si128 (__m128i a, __m128i b) @trusted
718 {
719 }
720 unittest
721 {
722 }
723 */
724 
725 /*
726 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
727 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
728 {
729 }
730 unittest
731 {
732 }
733 */
734 
735 /*
736 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.
737 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
738 {
739 }
740 unittest
741 {
742 }
743 */
744 
745 
746 // LDC intrinsics present from 1.0.0 to 
747 
748 /*
749 
750 pragma(LDC_intrinsic, "llvm.x86.sse41.blendvpd")
751     double2 __builtin_ia32_blendvpd(double2, double2, double2) pure @safe;
752 
753 pragma(LDC_intrinsic, "llvm.x86.sse41.blendvps")
754     float4 __builtin_ia32_blendvps(float4, float4, float4) pure @safe;
755 
756 pragma(LDC_intrinsic, "llvm.x86.sse41.dppd")
757     double2 __builtin_ia32_dppd(double2, double2, byte) pure @safe;
758 
759 pragma(LDC_intrinsic, "llvm.x86.sse41.dpps")
760     float4 __builtin_ia32_dpps(float4, float4, byte) pure @safe;
761 
762 pragma(LDC_intrinsic, "llvm.x86.sse41.insertps")
763     float4 __builtin_ia32_insertps128(float4, float4, byte) pure @safe;
764 
765 pragma(LDC_intrinsic, "llvm.x86.sse41.mpsadbw")
766     short8 __builtin_ia32_mpsadbw128(byte16, byte16, byte) pure @safe;
767 
768 pragma(LDC_intrinsic, "llvm.x86.sse41.packusdw")
769     short8 __builtin_ia32_packusdw128(int4, int4) pure @safe;
770 
771 pragma(LDC_intrinsic, "llvm.x86.sse41.pblendvb")
772     byte16 __builtin_ia32_pblendvb128(byte16, byte16, byte16) pure @safe;
773 
774 pragma(LDC_intrinsic, "llvm.x86.sse41.phminposuw")
775     short8 __builtin_ia32_phminposuw128(short8) pure @safe;
776 
777 
778 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestc")
779     int __builtin_ia32_ptestc128(long2, long2) pure @safe;
780 
781 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestnzc")
782     int __builtin_ia32_ptestnzc128(long2, long2) pure @safe;
783 
784 pragma(LDC_intrinsic, "llvm.x86.sse41.ptestz")
785     int __builtin_ia32_ptestz128(long2, long2) pure @safe;
786 
787 pragma(LDC_intrinsic, "llvm.x86.sse41.round.pd")
788     double2 __builtin_ia32_roundpd(double2, int) pure @safe;
789 
790 pragma(LDC_intrinsic, "llvm.x86.sse41.round.ps")
791     float4 __builtin_ia32_roundps(float4, int) pure @safe;
792 
793 pragma(LDC_intrinsic, "llvm.x86.sse41.round.sd")
794     double2 __builtin_ia32_roundsd(double2, double2, int) pure @safe;
795 
796 pragma(LDC_intrinsic, "llvm.x86.sse41.round.ss")
797     float4 __builtin_ia32_roundss(float4, float4, int) pure @safe;
798 
799     */