1 /**
2 * Copyright: Guillaume Piolat 2016-2019.
3 *            Charles Gregory 2019.
4 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5 * Authors:   Guillaume Piolat
6 */
7 module inteli.pmmintrin;
8 
9 public import inteli.types;
10 import inteli.internals;
11 import inteli.emmintrin;
12 
13 
14 // Note: this header will work whether you have SSE3 enabled or not.
15 // With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively 
16 // generate SSE3 instruction (they are often enabled with -O1 or greater).
17 
18 
19 nothrow @nogc:
20 
21 /// Alternatively add and subtract packed double-precision (64-bit) 
22 /// floating-point elements in `a` to/from packed elements in `b`.
/// Alternatively add and subtract packed double-precision (64-bit) 
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generates addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    // Lane 0 gets the difference, lane 1 gets the sum.
    __m128d r = a;
    r.ptr[0] = a.array[0] - b.array[0];
    r.ptr[1] = a.array[1] + b.array[1];
    return r;
}
unittest
{
    __m128d A = _mm_setr_pd(1.0, 2.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    __m128d R = _mm_addsub_pd(A, B);
    assert(R.array == _mm_setr_pd(0.0, 4.0).array);
}
38 
39 /// Alternatively add and subtract packed single-precision (32-bit) 
40 /// floating-point elements in `a` to/from packed elements in `b`.
/// Alternatively add and subtract packed single-precision (32-bit) 
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    // Even lanes get the difference, odd lanes get the sum.
    float4 r = a;
    r.ptr[0] = a.array[0] - b.array[0];
    r.ptr[1] = a.array[1] + b.array[1];
    r.ptr[2] = a.array[2] - b.array[2];
    r.ptr[3] = a.array[3] + b.array[3];
    return r;
}
unittest
{
    float4 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float4 B = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float4 R = _mm_addsub_ps(A, B);
    assert(R.array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}
57 
static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of double-precision (64-bit) 
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_haddpd(a, b);
    }
}
else
{
    /// Horizontally add adjacent pairs of double-precision (64-bit) 
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates haddpd with -O1
        // Low lane holds the pairwise sum of `a`, high lane that of `b`.
        __m128d sums;
        sums.ptr[0] = a.array[0] + a.array[1];
        sums.ptr[1] = b.array[0] + b.array[1];
        return sums;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.5, 2.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array );
}
86 
static if (LDC_with_SSE3)
{
    /// Horizontally add adjacent pairs of single-precision (32-bit) 
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_haddps(a, b);
    }
}
else
{
    /// Horizontally add adjacent pairs of single-precision (32-bit) 
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
    {
        // PERF: for GDC, detect SSE3 and use the relevant builtin
        // TODO: quite bad on #ARM
        // Two pairwise sums of `a` in the low half, two of `b` in the high half.
        __m128 sums;
        sums.ptr[0] = a.array[0] + a.array[1];
        sums.ptr[1] = a.array[2] + a.array[3];
        sums.ptr[2] = b.array[0] + b.array[1];
        sums.ptr[3] = b.array[2] + b.array[3];
        return sums;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}
120 
static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
    {
        return __builtin_ia32_hsubpd(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
    /// floating-point elements in `a` and `b`.
    __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
    {
        // On GDC this generates hsubpd with -O1
        // Each lane is (low element) - (high element) of one input.
        __m128d diffs;
        diffs.ptr[0] = a.array[0] - a.array[1];
        diffs.ptr[1] = b.array[0] - b.array[1];
        return diffs;
    }
}
unittest
{
    __m128d A = _mm_setr_pd(1.5, 2.0);
    __m128d B = _mm_setr_pd(1.0, 2.0);
    assert( _mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array );
}
149 
static if (LDC_with_SSE3)
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
    {
        return __builtin_ia32_hsubps(a, b);
    }
}
else
{
    /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
    /// floating-point elements in `a` and `b`.
    __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
    {
        // PERF: GDC doesn't generate the right instruction, do something
        // TODO: quite bad on #ARM
        // Pairwise differences of `a` in the low half, of `b` in the high half.
        __m128 diffs;
        diffs.ptr[0] = a.array[0] - a.array[1];
        diffs.ptr[1] = a.array[2] - a.array[3];
        diffs.ptr[2] = b.array[0] - b.array[1];
        diffs.ptr[3] = b.array[2] - b.array[3];
        return diffs;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}
181 
/// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is reputed to have only ever been useful around 2008;
// here it is simply an alias for the regular unaligned load.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
186 
187 
/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line breaks with LDC, but not when isolated!
        // was not reported yet.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}
213 
/// Duplicate the low double-precision (64-bit) floating-point element
/// from `a` into both elements of the result.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -01 for GDC
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}
226 
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[0] = a.array[1];
    a.ptr[2] = a.array[3];
    return a;
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    assert(A.array == _mm_setr_ps(2.0f, 2.0f, 4.0f, 4.0f).array);
}
236 
/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[1] = a.array[0];
    a.ptr[3] = a.array[2];
    return a;
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f));
    assert(A.array == _mm_setr_ps(1.0f, 1.0f, 3.0f, 3.0f).array);
}