1 /**
2 * SSE3 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2016-2019.
5 *            Charles Gregory 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Authors:   Guillaume Piolat
8 */
9 module inteli.pmmintrin;
10 
11 public import inteli.types;
12 import inteli.internals;
13 import inteli.emmintrin;
14 
15 
// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
19 
20 
21 nothrow @nogc:
22 
23 /// Alternatively add and subtract packed double-precision (64-bit) 
24 /// floating-point elements in `a` to/from packed elements in `b`.
25 __m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
26 {
27     // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
29     // ARM: well optimized starting with LDC 1.18.0 -O2
30     a.ptr[0] = a.array[0] - b.array[0];
31     a.ptr[1] = a.array[1] + b.array[1];
32     return a;
33 }
34 unittest
35 {
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
39 }
40 
41 /// Alternatively add and subtract packed single-precision (32-bit) 
42 /// floating-point elements in `a` to/from packed elements in `b`.
43 float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
44 {
45     // Note: generates addsubps since LDC 1.3.0 with -O1
46     // PERF: for GDC, detect SSE3 and use the relevant builtin
47     a.ptr[0] -= b.array[0];
48     a.ptr[1] += b.array[1];
49     a.ptr[2] -= b.array[2];
50     a.ptr[3] += b.array[3];
51     return a;
52 }
53 unittest
54 {
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
58 }
59 
60 static if (LDC_with_SSE3)
61 {
62     /// Horizontally add adjacent pairs of double-precision (64-bit) 
63     /// floating-point elements in `a` and `b`.
64     __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @safe
65     {
66         return __builtin_ia32_haddpd(a, b);
67     }
68 }
69 else
70 {
71     /// Horizontally add adjacent pairs of double-precision (64-bit) 
72     /// floating-point elements in `a` and `b`.
73     __m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
74     {
75         // On GDC this generates haddpd with -O1
76         __m128d res;
77         res.ptr[0] = a.array[1] + a.array[0];
78         res.ptr[1] = b.array[1] + b.array[0];
79         return res;
80     }
81 }
82 unittest
83 {
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
87 }
88 
89 static if (LDC_with_SSE3)
90 {
91     /// Horizontally add adjacent pairs of single-precision (32-bit) 
92     /// floating-point elements in `a` and `b`.
93     __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @safe
94     {
95     
96         return __builtin_ia32_haddps(a, b);
97     }
98 }
99 else static if (LDC_with_ARM64)
100 {
    /// Horizontally add adjacent pairs of single-precision (32-bit) 
    /// floating-point elements in `a` and `b`.
    float4 _mm_hadd_ps (float4 a, float4 b) pure @safe
102     {
103         return vpaddq_f32(a, b);
104     }
105 }
106 else
107 {
108     // PERF: for GDC, detect SSE3 and use the relevant builtin
109 
110     /// Horizontally add adjacent pairs of single-precision (32-bit) 
111     /// floating-point elements in `a` and `b`.
112     __m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
113     {
114         __m128 res;
115         res.ptr[0] = a.array[1] + a.array[0];
116         res.ptr[1] = a.array[3] + a.array[2];
117         res.ptr[2] = b.array[1] + b.array[0];
118         res.ptr[3] = b.array[3] + b.array[2];
119         return res;
120     }
121 }
122 unittest
123 {
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
127 }
128 
129 static if (LDC_with_SSE3)
130 {
131     /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
132     /// floating-point elements in `a` and `b`.
133     __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @safe
134     {
135         return __builtin_ia32_hsubpd(a, b);
136     }
137 }
138 else
139 {
140     /// Horizontally subtract adjacent pairs of double-precision (64-bit) 
141     /// floating-point elements in `a` and `b`.
142     __m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
143     {
144         // On GDC this generates hsubpd with -O1
145         __m128d res;
146         res.ptr[0] = a.array[0] - a.array[1];
147         res.ptr[1] = b.array[0] - b.array[1];
148         return res;
149     }
150 }
151 unittest
152 {
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
156 }
157 
158 static if (LDC_with_SSE3)
159 {
160     /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
161     /// floating-point elements in `a` and `b`.
162     __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @safe
163     {
164         return __builtin_ia32_hsubps(a, b);
165     }
166 }
167 else static if (LDC_with_ARM64)
168 {
169     /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
170     /// floating-point elements in `a` and `b`.
171     float4 _mm_hsub_ps (float4 a, float4 b) pure @safe
172     {
173         int4 mask = [0, 0x80000000, 0, 0x80000000];
174         a = cast(__m128)(cast(int4)a ^ mask);
175         b = cast(__m128)(cast(int4)b ^ mask);
176         return vpaddq_f32(a, b);
177     }
178 }
179 else
180 {
181     /// Horizontally subtract adjacent pairs of single-precision (32-bit) 
182     /// floating-point elements in `a` and `b`.
183     __m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
184     {
        // PERF: GDC doesn't generate the right instruction here; detect SSE3 and use the relevant builtin
186         __m128 res;
187         res.ptr[0] = a.array[0] - a.array[1];
188         res.ptr[1] = a.array[2] - a.array[3];
189         res.ptr[2] = b.array[0] - b.array[1];
190         res.ptr[3] = b.array[2] - b.array[3];
191         return res;
192     }
193 }
194 unittest
195 {
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
198     assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
199 }
200 
201 /// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU is said to have only ever been useful around 2008.
203 // See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
204 alias _mm_lddqu_si128 = _mm_loadu_si128;
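// Since it is just an alias, this illustrative unittest merely checks the unaligned-load
// behaviour shared with `_mm_loadu_si128` (the source pointer need not be 16-byte aligned).
unittest
{
    int[8] buffer = [1, 2, 3, 4, 5, 6, 7, 8];
    // &buffer[1] is generally not 16-byte aligned; the load must still work.
    __m128i R = _mm_lddqu_si128(cast(const(__m128i)*) &buffer[1]);
    int[4] correct = [2, 3, 4, 5];
    assert(R.array == correct);
}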
205 
206 
/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
208 {
209     // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
210     // Same for GDC with -O1
211     double value = *mem_addr;
212     __m128d res;
213     res.ptr[0] = value;
214     res.ptr[1] = value;
215     return res;
216 }
217 unittest
218 {
219     version(LDC)
220     {
221         double a = 7.5;
222         assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
223     }
224     else
225     {
226         double a = 7.5;
        // For some reason, this line breaks with LDC, but not when isolated.
        // Not reported upstream yet.
229         assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
230     }
231 }
232 
/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
237     a.ptr[1] = a.array[0];
238     return a;
239 }
240 unittest
241 {
242     __m128d A = _mm_setr_pd(7.0, 2.5);
243     assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
244 }
245 
246 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
247 __m128 _mm_movehdup_ps (__m128 a) pure @trusted
248 {
249     // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates it, though.
251     a.ptr[0] = a.array[1];
252     a.ptr[2] = a.array[3];
253     return a;
254 }
255 
256 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
257 __m128 _mm_moveldup_ps (__m128 a) pure @trusted
258 {
259     // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF: GDC never generates it, though.
261     a.ptr[1] = a.array[0];
262     a.ptr[3] = a.array[2];
263     return a;
264 }