/**
* SSE3 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
*
* Copyright: Guillaume Piolat 2016-2020.
*            Charles Gregory 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.pmmintrin;

public import inteli.types;
import inteli.internals;
public import inteli.emmintrin;


// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
// With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.
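// For example, a hypothetical dub.json could pass those flags with:
//     "dflags-ldc": ["-mattr=+sse3"],
//     "dflags-gdc": ["-msse3"]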


nothrow @nogc:

/// Alternatively add and subtract packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.ADDSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_pd(1.0, 2.0);
    auto v2 = _mm_setr_pd(1.0, 2.0);
    assert(_mm_addsub_pd(v1, v2).array == _mm_setr_pd(0.0, 4.0).array);
}

/// Alternatively add and subtract packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.ADDSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        a.ptr[0] -= b.array[0];
        a.ptr[1] += b.array[1];
        a.ptr[2] -= b.array[2];
        a.ptr[3] += b.array[3];
        return a;
    }
}
unittest
{
    auto v1 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert(_mm_addsub_ps(v1, v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array);
}


/// Horizontally add adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF: ARM64?
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HADDPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = b.array[1] + b.array[0];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hadd_pd(A, B).array == _mm_setr_pd(3.5, 3.0).array);
}

/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HADDPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array);
}

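// Usage sketch (not from the original test suite): two successive horizontal adds
// reduce a 4-lane vector to the sum of all its elements, broadcast to every lane.
unittest
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 s = _mm_hadd_ps(v, v); // [3, 7, 3, 7]
    s = _mm_hadd_ps(s, s);        // [10, 10, 10, 10]
    float[4] correct = [10.0f, 10.0f, 10.0f, 10.0f];
    assert(s.array == correct);
}
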
/// Horizontally subtract adjacent pairs of double-precision (64-bit)
/// floating-point elements in `a` and `b`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128d) __simd(XMM.HSUBPD, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {
        // Perhaps surprisingly, this scalar form also appears optimal for ARM64.
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = b.array[0] - b.array[1];
        return res;
    }
}
unittest
{
    auto A = _mm_setr_pd(1.5, 2.0);
    auto B = _mm_setr_pd(1.0, 2.0);
    assert(_mm_hsub_pd(A, B).array == _mm_setr_pd(-0.5, -1.0).array);
}

/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (DMD_with_DSIMD_and_SSE3)
    {
        return cast(__m128) __simd(XMM.HSUBPS, cast(void16)a, cast(void16)b);
    }
    else static if (GDC_or_LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of odd lanes so that a pairwise add becomes a pairwise subtract.
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B = _mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}

/// Load 128-bits of integer data from unaligned memory.
// Note: reportedly, LDDQU was only ever useful on hardware from around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
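// A minimal usage sketch (added for illustration): since `_mm_lddqu_si128` aliases
// `_mm_loadu_si128` here, it can load 16 bytes from any, possibly unaligned, address.
unittest
{
    align(16) byte[17] buf;
    foreach (i; 0..17)
        buf[i] = cast(byte)i;
    __m128i v = _mm_lddqu_si128(cast(const(__m128i)*) &buf[1]); // deliberately unaligned
    byte16 r = cast(byte16) v;
    assert(r.array[0] == 1 && r.array[15] == 16);
}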

/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC (but not when isolated); it was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}

/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // GDC generates something efficient with -O1 as well.
    a.ptr[1] = a.array[0];
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}

/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movshdup(a);
    }
    else
    {
        // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[0] = a.array[1];
        a.ptr[2] = a.array[3];
        return a;
    }
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4];
    assert(A.array == correct);
}

/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        return __builtin_ia32_movsldup(a);
    }
    else
    {
        // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
        a.ptr[1] = a.array[0];
        a.ptr[3] = a.array[2];
        return a;
    }
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3];
    assert(A.array == correct);
}
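
// Usage sketch (added for illustration, not part of the original tests): SSE3's
// addsub + moveldup/movehdup combination was designed to speed up complex multiplication.
// Below, (1+2i)*(3+4i) and (5+6i)*(7+8i) are computed in one vector sequence.
unittest
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 5.0f, 6.0f); // [re, im, re, im]
    __m128 b = _mm_setr_ps(3.0f, 4.0f, 7.0f, 8.0f);
    // Real code would swap b's lanes with a shuffle; written literally here for clarity.
    __m128 bSwapped = _mm_setr_ps(4.0f, 3.0f, 8.0f, 7.0f);
    __m128 t1 = _mm_mul_ps(_mm_moveldup_ps(a), b);        // re(a) * b
    __m128 t2 = _mm_mul_ps(_mm_movehdup_ps(a), bSwapped); // im(a) * swapped b
    __m128 r  = _mm_addsub_ps(t1, t2);                    // interleaved [re, im, re, im] results
    float[4] correct = [-5.0f, 10.0f, -13.0f, 82.0f];
    assert(r.array == correct);
}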