1 /**
2 * SSE4.1 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2021.
5 *            Johan Engelen 2021.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.smmintrin;
9 
10 // SSE4.1 instructions
11 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
12 // Note: this header will work whether you have SSE4.1 enabled or not.
13 // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
14 // generate SSE4.1 instructions.
15 
16 public import inteli.types;
17 import inteli.internals;
18 
19 // smmintrin pulls in all previous instruction set intrinsics.
20 public import inteli.tmmintrin;
21 
22 nothrow @nogc:
23 
/// SSE4.1 rounding modes.
/// These flags are OR-ed together to build a rounding-control value.
enum : int
{
    _MM_FROUND_TO_NEAREST_INT = 0x00, /// Round to nearest.
    _MM_FROUND_TO_NEG_INF     = 0x01, /// Round down.
    _MM_FROUND_TO_POS_INF     = 0x02, /// Round up.
    _MM_FROUND_TO_ZERO        = 0x03, /// Truncate.
    _MM_FROUND_CUR_DIRECTION  = 0x04, /// Use the current rounding direction (MXCSR.RC).
    _MM_FROUND_RAISE_EXC      = 0x00, /// No bit set, i.e. `_MM_FROUND_NO_EXC` is absent.
    _MM_FROUND_NO_EXC         = 0x08, /// Suppress exceptions.
}

/// Ready-made combinations of one rounding direction and one exception flag.
enum : int
{
    _MM_FROUND_NINT      = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC, /// Nearest, exceptions not suppressed.
    _MM_FROUND_FLOOR     = _MM_FROUND_TO_NEG_INF     | _MM_FROUND_RAISE_EXC, /// Down, exceptions not suppressed.
    _MM_FROUND_CEIL      = _MM_FROUND_TO_POS_INF     | _MM_FROUND_RAISE_EXC, /// Up, exceptions not suppressed.
    _MM_FROUND_TRUNC     = _MM_FROUND_TO_ZERO        | _MM_FROUND_RAISE_EXC, /// Truncate, exceptions not suppressed.
    _MM_FROUND_RINT      = _MM_FROUND_CUR_DIRECTION  | _MM_FROUND_RAISE_EXC, /// Current direction, exceptions not suppressed.
    _MM_FROUND_NEARBYINT = _MM_FROUND_CUR_DIRECTION  | _MM_FROUND_NO_EXC,    /// Current direction, exceptions suppressed.
}
38 
39 /*
40 /// Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.
41 __m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8) @trusted
42 {
43 }
44 unittest
45 {
46 }
47 */
48 
49 /*
50 /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
51 __m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8) @trusted
52 {
53 }
54 unittest
55 {
56 }
57 */
58 
59 /*
60 /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
61 __m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8) @trusted
62 {
63 }
64 unittest
65 {
66 }
67 */
68 
69 /*
70 /// Blend packed 8-bit integers from a and b using mask, and store the results in dst.
71 __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) @trusted
72 {
73 }
74 unittest
75 {
76 }
77 */
78 
79 /*
80 /// Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.
81 __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
82 {
83 }
84 unittest
85 {
86 }
87 */
88 
89 /*
90 /// Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.
91 __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) @trusted
92 {
93 }
94 unittest
95 {
96 }
97 */
98 
99 /*
100 /// Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
101 __m128d _mm_ceil_pd (__m128d a) @trusted
102 {
103 }
104 unittest
105 {
106 }
107 */
108 
109 /*
110 /// Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.
111 __m128 _mm_ceil_ps (__m128 a) @trusted
112 {
113 }
114 unittest
115 {
116 }
117 */
118 
119 /*
120 /// Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
121 __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
122 {
123 }
124 unittest
125 {
126 }
127 */
128 
129 /*
130 /// Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
131 __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
132 {
133 }
134 unittest
135 {
136 }
137 */
138 
139 /*
140 /// Compare packed 64-bit integers in a and b for equality, and store the results in dst.
141 __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
142 {
143 }
144 unittest
145 {
146 }
147 */
148 
149 /*
150 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
151 __m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
152 {
153 }
154 unittest
155 {
156 }
157 */
158 
159 /*
160 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
161 __m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
162 {
163 }
164 unittest
165 {
166 }
167 */
168 
169 /*
170 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
171 __m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
172 {
173 }
174 unittest
175 {
176 }
177 */
178 
179 /*
180 /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
181 __m128i _mm_cvtepi8_epi16 (__m128i a) @trusted
182 {
183 }
184 unittest
185 {
186 }
187 */
188 
189 /*
190 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
191 __m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
192 {
193 }
194 unittest
195 {
196 }
197 */
198 
199 /*
200 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
201 __m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
202 {
203 }
204 unittest
205 {
206 }
207 */
208 
209 /*
210 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
211 __m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
212 {
213 }
214 unittest
215 {
216 }
217 */
218 
219 /*
220 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
221 __m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
222 {
223 }
224 unittest
225 {
226 }
227 */
228 
229 /*
230 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
231 __m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
232 {
233 }
234 unittest
235 {
236 }
237 */
238 
239 /*
240 /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
241 __m128i _mm_cvtepu8_epi16 (__m128i a) @trusted
242 {
243 }
244 unittest
245 {
246 }
247 */
248 
249 /*
250 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
251 __m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
252 {
253 }
254 unittest
255 {
256 }
257 */
258 
259 /*
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
261 __m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
262 {
263 }
264 unittest
265 {
266 }
267 */
268 
269 /*
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the two products, and conditionally store the sum in dst using the low 4 bits of imm8.
271 __m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8) @trusted
272 {
273 }
274 unittest
275 {
276 }
277 */
278 
279 /*
280 /// Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
281 __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8) @trusted
282 {
283 }
284 unittest
285 {
286 }
287 */
288 
289 /*
290 /// Extract a 32-bit integer from a, selected with imm8, and store the result in dst.
291 int _mm_extract_epi32 (__m128i a, const int imm8) @trusted
292 {
293 }
294 unittest
295 {
296 }
297 */
298 
299 /*
300 /// Extract a 64-bit integer from a, selected with imm8, and store the result in dst.
301 __int64 _mm_extract_epi64 (__m128i a, const int imm8) @trusted
302 {
303 }
304 unittest
305 {
306 }
307 */
308 
309 /*
310 /// Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst.
311 int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
312 {
313 }
314 unittest
315 {
316 }
317 */
318 
319 /*
320 /// Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst.
321 int _mm_extract_ps (__m128 a, const int imm8) @trusted
322 {
323 }
324 unittest
325 {
326 }
327 */
328 
329 /*
330 /// Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.
331 __m128d _mm_floor_pd (__m128d a) @trusted
332 {
333 }
334 unittest
335 {
336 }
337 */
338 
339 /*
340 /// Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.
341 __m128 _mm_floor_ps (__m128 a) @trusted
342 {
343 }
344 unittest
345 {
346 }
347 */
348 
349 /*
350 /// Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
351 __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
352 {
353 }
354 unittest
355 {
356 }
357 */
358 
359 /*
360 /// Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
361 __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
362 {
363 }
364 unittest
365 {
366 }
367 */
368 
369 /*
370 /// Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8.
371 __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) @trusted
372 {
373 }
374 unittest
375 {
376 }
377 */
378 
379 /*
380 /// Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8.
381 __m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8) @trusted
382 {
383 }
384 unittest
385 {
386 }
387 */
388 
389 /*
390 /// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
391 __m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
392 {
393 }
394 unittest
395 {
396 }
397 */
398 
399 /*
400 /// Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set).
401 __m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8) @trusted
402 {
403 }
404 unittest
405 {
406 }
407 */
408 
409 /*
410 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
411 __m128i _mm_max_epi32 (__m128i a, __m128i b) @trusted
412 {
413 }
414 unittest
415 {
416 }
417 */
418 
419 /*
420 /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
421 __m128i _mm_max_epi8 (__m128i a, __m128i b) @trusted
422 {
423 }
424 unittest
425 {
426 }
427 */
428 
429 /*
430 /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
431 __m128i _mm_max_epu16 (__m128i a, __m128i b) @trusted
432 {
433 }
434 unittest
435 {
436 }
437 */
438 
439 /*
440 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
441 __m128i _mm_max_epu32 (__m128i a, __m128i b) @trusted
442 {
443 }
444 unittest
445 {
446 }
447 */
448 
449 /*
450 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
451 __m128i _mm_min_epi32 (__m128i a, __m128i b) @trusted
452 {
453 }
454 unittest
455 {
456 }
457 */
458 
459 /*
460 /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
461 __m128i _mm_min_epi8 (__m128i a, __m128i b) @trusted
462 {
463 }
464 unittest
465 {
466 }
467 */
468 
469 /*
470 /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
471 __m128i _mm_min_epu16 (__m128i a, __m128i b) @trusted
472 {
473 }
474 unittest
475 {
476 }
477 */
478 
479 /*
480 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
481 __m128i _mm_min_epu32 (__m128i a, __m128i b) @trusted
482 {
483 }
484 unittest
485 {
486 }
487 */
488 
489 /*
490 /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst.
491 __m128i _mm_minpos_epu16 (__m128i a) @trusted
492 {
493 }
494 unittest
495 {
496 }
497 */
498 
499 /*
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.
501 __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8) @trusted
502 {
503 }
504 unittest
505 {
506 }
507 */
508 
509 /*
510 /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
511 __m128i _mm_mul_epi32 (__m128i a, __m128i b) @trusted
512 {
513 }
514 unittest
515 {
516 }
517 */
518 
519 /*
520 /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
521 __m128i _mm_mullo_epi32 (__m128i a, __m128i b) @trusted
522 {
523 }
524 unittest
525 {
526 }
527 */
528 
529 /*
530 /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
531 __m128i _mm_packus_epi32 (__m128i a, __m128i b) @trusted
532 {
533 }
534 unittest
535 {
536 }
537 */
538 
/*
/// Round the packed double-precision (64-bit) floating-point elements in a using the rounding parameter, and store the results as packed double-precision floating-point elements in dst.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
547 __m128d _mm_round_pd (__m128d a, int rounding) @trusted
548 {
549 }
550 unittest
551 {
552 }
553 */
554 
/*
/// Round the packed single-precision (32-bit) floating-point elements in a using the rounding parameter, and store the results as packed single-precision floating-point elements in dst.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
563 __m128 _mm_round_ps (__m128 a, int rounding) @trusted
564 {
565 }
566 unittest
567 {
568 }
569 */
570 
/*
/// Round the lower double-precision (64-bit) floating-point element in b using the rounding parameter, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
579 __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) @trusted
580 {
581 }
582 unittest
583 {
584 }
585 */
586 
/*
/// Round the lower single-precision (32-bit) floating-point element in b using the rounding parameter, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
595 __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) @trusted
596 {
597 }
598 unittest
599 {
600 }
601 */
602 
603 /*
604 /// Load 128-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
605 __m128i _mm_stream_load_si128 (__m128i * mem_addr) @trusted
606 {
607 }
608 unittest
609 {
610 }
611 */
612 
613 /*
614 /// Compute the bitwise NOT of a and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0.
615 int _mm_test_all_ones (__m128i a) @trusted
616 {
617 }
618 unittest
619 {
620 }
621 */
622 
623 /*
624 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0.
625 int _mm_test_all_zeros (__m128i a, __m128i mask) @trusted
626 {
627 }
628 unittest
629 {
630 }
631 */
632 
633 /*
634 /// Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
635 int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
636 {
637 }
638 unittest
639 {
640 }
641 */
642 
643 /*
644 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.
645 int _mm_testc_si128 (__m128i a, __m128i b) @trusted
646 {
647 }
648 unittest
649 {
650 }
651 */
652 
653 /*
654 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
655 int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
656 {
657 }
658 unittest
659 {
660 }
661 */
662 
663 /*
664 /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.
665 int _mm_testz_si128 (__m128i a, __m128i b) @trusted
666 {
667 }
668 unittest
669 {
670 }
671 */