1 /**
2 * Copyright: Guillaume Piolat 2016-2019.
3 *            Charles Gregory 2019.
4 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5 * Authors:   Guillaume Piolat
6 */
7 module inteli.pmmintrin;
8 
9 public import inteli.types;
10 import inteli.internals;
11 import inteli.emmintrin;
12 
13 
14 // Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
17 
18 
19 nothrow @nogc:
20 
21 /// Alternatively add and subtract packed double-precision (64-bit) 
22 /// floating-point elements in `a` to/from packed elements in `b`.
23 __m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
24 {
25     // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
27     a.ptr[0] = a.array[0] - b.array[0];
28     a.ptr[1] = a.array[1] + b.array[1];
29     return a;
30 }
31 unittest
32 {
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
36 }
37 
38 /// Alternatively add and subtract packed single-precision (32-bit) 
39 /// floating-point elements in `a` to/from packed elements in `b`.
40 float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
41 {
42     // Note: generates addsubps since LDC 1.3.0 with -O1
43     // PERF: for GDC, detect SSE3 and use the relevant builtin
44     a.ptr[0] -= b.array[0];
45     a.ptr[1] += b.array[1];
46     a.ptr[2] -= b.array[2];
47     a.ptr[3] += b.array[3];
48     return a;
49 }
50 unittest
51 {
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
55 }
56 
57 version(LDC)
58 {
59     /// Horizontally add adjacent pairs of double-precision (64-bit) 
60     /// floating-point elements in `a` and `b`.
61     __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
62     {
63         static if (__traits(targetHasFeature, "sse3"))
64         {
65             return __builtin_ia32_haddpd(a, b);
66         }
67         else
68         {
69             __m128d res;
70             res[0] = a[1] + a[0];
71             res[1] = b[1] + b[0];
72             return res;
73         }
74     }
75 }
76 else
77 {
78     /// Horizontally add adjacent pairs of double-precision (64-bit) 
79     /// floating-point elements in `a` and `b`.
80     __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
81     {
82         // On GDC this generates haddpd with -O1
83         __m128d res;
84         res.ptr[0] = a.array[1] + a.array[0];
85         res.ptr[1] = b.array[1] + b.array[0];
86         return res;
87     }
88 }
89 unittest
90 {
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
94 }
95 
96 version(LDC)
97 {
98     /// Horizontally add adjacent pairs of single-precision (32-bit) 
99     /// floating-point elements in `a` and `b`.
100     __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
101     {
102         static if (__traits(targetHasFeature, "sse3"))
103         {
104             return __builtin_ia32_haddps(a, b);
105         }
106         else
107         {
108             __m128 res;
109             res[0] = a[1] + a[0];
110             res[1] = a[3] + a[2];
111             res[2] = b[1] + b[0];
112             res[3] = b[3] + b[2];
113             return res;
114         }
115     }
116 }
117 else
118 {
119     // PERF: for GDC, detect SSE3 and use the relevant builtin
120 
121     /// Horizontally add adjacent pairs of single-precision (32-bit) 
122     /// floating-point elements in `a` and `b`.
123     __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
124     {
125         __m128 res;
126         res.ptr[0] = a.array[1] + a.array[0];
127         res.ptr[1] = a.array[3] + a.array[2];
128         res.ptr[2] = b.array[1] + b.array[0];
129         res.ptr[3] = b.array[3] + b.array[2];
130         return res;
131     }
132 }
133 unittest
134 {
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
138 }
139 
140 version(LDC)
141 {
142     /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
143     /// floating-point elements in `a` and `b`.
144     __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
145     {
146         static if (__traits(targetHasFeature, "sse3"))
147         {
148             return __builtin_ia32_hsubpd(a, b);
149         }
150         else
151         {
152             __m128d res;
153             res[0] = a[0] - a[1];
154             res[1] = b[0] - b[1];
155             return res;
156         }
157     }
158 }
159 else
160 {
161     /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
162     /// floating-point elements in `a` and `b`.
163     __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
164     {
165         // On GDC this generates hsubpd with -O1
166         __m128d res;
167         res.ptr[0] = a.array[0] - a.array[1];
168         res.ptr[1] = b.array[0] - b.array[1];
169         return res;
170     }
171 }
172 unittest
173 {
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
177 }
178 
179 version(LDC)
180 {
181     /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
182     /// floating-point elements in `a` and `b`.
183     __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
184     {
185         static if (__traits(targetHasFeature, "sse3"))
186         {
187             return __builtin_ia32_hsubps(a, b);
188         }
189         else
190         {
191             __m128 res;
192             res[0] = a[0] - a[1];
193             res[1] = a[2] - a[3];
194             res[2] = b[0] - b[1];
195             res[3] = b[2] - b[3];
196             return res;
197         }
198     }
199 }
200 else
201 {
202     /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
203     /// floating-point elements in `a` and `b`.
204     __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
205     {
        // PERF: GDC doesn't generate hsubps here; detect SSE3 and use the relevant builtin instead
207         __m128 res;
208         res.ptr[0] = a.array[0] - a.array[1];
209         res.ptr[1] = a.array[2] - a.array[3];
210         res.ptr[2] = b.array[0] - b.array[1];
211         res.ptr[3] = b.array[2] - b.array[3];
212         return res;
213     }
214 }
215 unittest
216 {
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
219     assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
220 }
221 
222 /// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful on hardware from around 2008.
224 // See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
225 alias _mm_lddqu_si128 = _mm_loadu_si128;
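// Minimal usage sketch (illustrative): since `_mm_lddqu_si128` aliases `_mm_loadu_si128`,
// it is assumed to accept the same possibly-unaligned pointer and return the same data.
unittest
{
    align(16) byte[17] buffer;
    foreach (int i; 0..17)
        buffer[i] = cast(byte)i;
    // Deliberately unaligned source pointer, one byte past an aligned address.
    byte16 A = cast(byte16) _mm_lddqu_si128(cast(__m128i*) &buffer[1]);
    foreach (int i; 0..16)
        assert(A.array[i] == cast(byte)(i + 1));
}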
226 
227 
/// Load a double-precision (64-bit) floating-point element from memory into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
229 {
230     // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
231     // Same for GDC with -O1
232     double value = *mem_addr;
233     __m128d res;
234     res.ptr[0] = value;
235     res.ptr[1] = value;
236     return res;
237 }
238 unittest
239 {
240     version(LDC)
241     {
242         double a = 7.5;
243         assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
244     }
245     else
246     {
247         double a = 7.5;
        // For some reason this line breaks when compiled with LDC (hence the branch above),
        // but not when isolated; this has not been reported yet.
250         assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
251     }
252 }
253 
/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
255 {
256     // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // GDC generates something efficient with -O1
258     a.ptr[1] = a.array[0];
259     return a;
260 }
261 unittest
262 {
263     __m128d A = _mm_setr_pd(7.0, 2.5);
264     assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
265 }
266 
267 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
268 __m128 _mm_movehdup_ps (__m128 a) pure @trusted
269 {
270     // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates movshdup
272     a.ptr[0] = a.array[1];
273     a.ptr[2] = a.array[3];
274     return a;
275 }
276 
277 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
278 __m128 _mm_moveldup_ps (__m128 a) pure @trusted
279 {
280     // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates movsldup
282     a.ptr[1] = a.array[0];
283     a.ptr[3] = a.array[2];
284     return a;
285 }