1 /**
2 * SSE3 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
4 *
5 * Copyright: Guillaume Piolat 2016-2020.
6 *            Charles Gregory 2019.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.pmmintrin;
10 
11 public import inteli.types;
12 import inteli.internals;
13 public import inteli.emmintrin;
14 
15 
16 // Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
19 
20 
21 nothrow @nogc:
22 
/// Alternately subtract and add packed double-precision (64-bit)
/// floating-point elements: the element at index 0 of `b` is subtracted
/// from the one in `a`, the element at index 1 of `b` is added to the one in `a`.
/// Returns: `[a[0] - b[0], a[1] + b[1]]`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    a.ptr[0] -= b.array[0]; // even lane: subtract
    a.ptr[1] += b.array[1]; // odd lane: add
    return a;
}
unittest
{
    __m128d lhs = _mm_setr_pd(1.0, 2.0);
    __m128d rhs = _mm_setr_pd(1.0, 2.0);
    double[2] correct = [0.0, 4.0];
    assert(_mm_addsub_pd(lhs, rhs).array == correct);
}
40 
/// Alternately subtract and add packed single-precision (32-bit)
/// floating-point elements: even-indexed elements of `b` are subtracted
/// from those of `a`, odd-indexed elements of `b` are added to those of `a`.
/// Returns: `[a[0] - b[0], a[1] + b[1], a[2] - b[2], a[3] + b[3]]`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    a.ptr[0] = a.array[0] - b.array[0];
    a.ptr[1] = a.array[1] + b.array[1];
    a.ptr[2] = a.array[2] - b.array[2];
    a.ptr[3] = a.array[3] + b.array[3];
    return a;
}
unittest
{
    __m128 lhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 rhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float[4] correct = [0.0f, 4.0f, 0.0f, 8.0f];
    assert(_mm_addsub_ps(lhs, rhs).array == correct);
}
59 
60 
/// Horizontally add the two double-precision (64-bit) floating-point
/// elements of `a`, and likewise the two elements of `b`.
/// Returns: `[a[1] + a[0], b[1] + b[0]]` (as spelled out by the scalar fallback).
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the compiler builtin directly.
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        // On GDC this generates haddpd with -O1
        immutable double sumOfA = a.array[1] + a.array[0];
        immutable double sumOfB = b.array[1] + b.array[0];
        __m128d res;
        res.ptr[0] = sumOfA;
        res.ptr[1] = sumOfB;
        return res;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.5, 2.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    double[2] correct = [3.5, 3.0];
    assert(_mm_hadd_pd(A, B).array == correct);
}
84 
/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
/// Returns: `[a[1] + a[0], a[3] + a[2], b[1] + b[0], b[3] + b[2]]`
/// (as spelled out by the scalar fallback).
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the compiler builtin directly.
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC targeting ARM64: pairwise-add intrinsic.
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: for GDC, detect SSE3 and use the relevant builtin
        immutable float lowPairA  = a.array[1] + a.array[0];
        immutable float highPairA = a.array[3] + a.array[2];
        immutable float lowPairB  = b.array[1] + b.array[0];
        immutable float highPairB = b.array[3] + b.array[2];
        __m128 res;
        res.ptr[0] = lowPairA;
        res.ptr[1] = highPairA;
        res.ptr[2] = lowPairB;
        res.ptr[3] = highPairB;
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    float[4] correct = [3.0f, 8.0f, 3.5f, 7.5f];
    assert(_mm_hadd_ps(A, B).array == correct);
}
114 
/// Horizontally subtract the two double-precision (64-bit) floating-point
/// elements of `a`, and likewise the two elements of `b`.
/// Returns: `[a[0] - a[1], b[0] - b[1]]` (as spelled out by the scalar fallback).
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the compiler builtin directly.
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // On GDC this generates hsubpd with -O1
        immutable double diffOfA = a.array[0] - a.array[1];
        immutable double diffOfB = b.array[0] - b.array[1];
        __m128d res;
        res.ptr[0] = diffOfA;
        res.ptr[1] = diffOfB;
        return res;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.5, 2.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    double[2] correct = [-0.5, -1.0];
    assert(_mm_hsub_pd(A, B).array == correct);
}
138 
/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
/// Returns: `[a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]`
/// (as spelled out by the scalar fallback).
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the compiler builtin directly.
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of the odd-indexed lanes, then pairwise-add:
        // x[0] + (-x[1]) stands in for x[0] - x[1].
        int4 oddSignBits = [0, 0x80000000, 0, 0x80000000];
        __m128 flippedA = cast(__m128)(cast(int4)a ^ oddSignBits);
        __m128 flippedB = cast(__m128)(cast(int4)b ^ oddSignBits);
        return vpaddq_f32(flippedA, flippedB);
    }
    else
    {
        // PERF: GDC doesn't generate the right instruction, do something
        immutable float lowPairA  = a.array[0] - a.array[1];
        immutable float highPairA = a.array[2] - a.array[3];
        immutable float lowPairB  = b.array[0] - b.array[1];
        immutable float highPairB = b.array[2] - b.array[3];
        __m128 res;
        res.ptr[0] = lowPairA;
        res.ptr[1] = highPairA;
        res.ptr[2] = lowPairB;
        res.ptr[3] = highPairB;
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    float[4] correct = [-1.0f, -2.0f, -0.5f, -0.5f];
    assert(_mm_hsub_ps(A, B).array == correct);
}
171 
/// Load 128-bits of integer data from unaligned memory.
/// This is a plain alias of `_mm_loadu_si128`; no dedicated LDDQU code path exists here.
// Note: LDDQU is said to have been useful only around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
176 
/// Load one double-precision (64-bit) floating-point element from memory
/// and broadcast it into both elements of the result.
/// Params:
///     mem_addr = pointer to the double to read; it is dereferenced once.
/// Returns: `[*mem_addr, *mem_addr]`.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    immutable double loaded = mem_addr[0];
    __m128d res;
    res.ptr[0] = loaded;
    res.ptr[1] = loaded;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC, but not when isolated! Was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}
202 
/// Duplicate the low double-precision (64-bit) floating-point element of `a`
/// into both elements of the result.
/// Returns: `[a[0], a[0]]`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    immutable double low = a.array[0];
    a.ptr[1] = low;
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    double[2] correct = [7.0, 7.0];
    assert(_mm_movedup_pd(A).array == correct);
}
216 
/// Duplicate the odd-indexed single-precision (32-bit) floating-point
/// elements of `a` into the even-indexed positions just below them.
/// Returns: `[a[1], a[1], a[3], a[3]]`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    immutable float odd1 = a.array[1];
    immutable float odd3 = a.array[3];
    a.ptr[0] = odd1;
    a.ptr[2] = odd3;
    return a;
}
unittest
{
    __m128 input = _mm_setr_ps(1, 2, 3, 4);
    __m128 duplicated = _mm_movehdup_ps(input);
    assert(duplicated.array == _mm_setr_ps(2, 2, 4, 4).array);
}
232 
/// Duplicate the even-indexed single-precision (32-bit) floating-point
/// elements of `a` into the odd-indexed positions just above them.
/// Returns: `[a[0], a[0], a[2], a[2]]`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    immutable float even0 = a.array[0];
    immutable float even2 = a.array[2];
    a.ptr[1] = even0;
    a.ptr[3] = even2;
    return a;
}
unittest
{
    __m128 input = _mm_setr_ps(1, 2, 3, 4);
    __m128 duplicated = _mm_moveldup_ps(input);
    assert(duplicated.array == _mm_setr_ps(1, 1, 3, 3).array);
}