1 /**
2 * SSE3 intrinsics.
3 * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE3
4 *
5 * Copyright: Guillaume Piolat 2016-2020.
6 *            Charles Gregory 2019.
7 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8 */
9 module inteli.pmmintrin;
10 
11 public import inteli.types;
12 import inteli.internals;
13 public import inteli.emmintrin;
14 
15 
16 // Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
19 // With GDC, use "dflags-gdc": ["-msse3"] or equivalent to generate SSE3 instructions.
20 
21 
22 nothrow @nogc:
23 
/// Alternately subtract and add packed double-precision (64-bit)
/// floating-point elements: the result is `[a[0] - b[0], a[1] + b[1]]`.
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE3 || LDC_with_SSE3)
    {
        // GDC and LDC both expose the builtin under the same name.
        return __builtin_ia32_addsubpd(a, b);
    }
    else
    {
        // ARM: well optimized starting with LDC 1.18.0 -O2, not disrupted by LLVM 13+
        a.ptr[0] -= b.array[0]; // even lane: subtract
        a.ptr[1] += b.array[1]; // odd lane: add
        return a;
    }
}
unittest
{
    __m128d lhs = _mm_setr_pd(1.0, 2.0);
    __m128d rhs = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [0.0, 4.0];
    assert(_mm_addsub_pd(lhs, rhs).array == expected);
}
51 
/// Alternately subtract and add packed single-precision (32-bit)
/// floating-point elements: even lanes hold `a[i] - b[i]`,
/// odd lanes hold `a[i] + b[i]`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE3 || LDC_with_SSE3)
    {
        // GDC and LDC both expose the builtin under the same name.
        return __builtin_ia32_addsubps(a, b);
    }
    else
    {
        a.ptr[0] = a.array[0] - b.array[0];
        a.ptr[1] = a.array[1] + b.array[1];
        a.ptr[2] = a.array[2] - b.array[2];
        a.ptr[3] = a.array[3] + b.array[3];
        return a;
    }
}
unittest
{
    float4 lhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float4 rhs = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float[4] expected = [0.0f, 4.0f, 0.0f, 8.0f];
    assert(_mm_addsub_ps(lhs, rhs).array == expected);
}
80 
81 
/// Horizontal add of double-precision (64-bit) floating-point pairs:
/// the result is `[a[1] + a[0], b[1] + b[0]]`.
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
        return __builtin_ia32_haddpd(a, b);
    else
    {
        // On GDC this generates haddpd with -O1
        immutable double sumA = a.array[1] + a.array[0];
        immutable double sumB = b.array[1] + b.array[0];
        __m128d sums; // PERF =void;
        sums.ptr[0] = sumA;
        sums.ptr[1] = sumB;
        return sums;
    }
}
unittest
{
    __m128d lhs = _mm_setr_pd(1.5, 2.0);
    __m128d rhs = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [3.5, 3.0];
    assert(_mm_hadd_pd(lhs, rhs).array == expected);
}
105 
/// Horizontally add adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
///
/// Returns: `[a[1] + a[0], a[3] + a[2], b[1] + b[0], b[3] + b[2]]`.
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        // Go through the GCC builtin when SSE3 is enabled instead of
        // relying on the optimizer to recognize the scalar pattern below.
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        return vpaddq_f32(a, b);
    }
    else
    {
        // Portable fallback: two pair sums from `a`, then two from `b`.
        __m128 res; // PERF =void;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}
135 
/// Horizontal subtract of double-precision (64-bit) floating-point pairs:
/// the result is `[a[0] - a[1], b[0] - b[1]]`.
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
        return __builtin_ia32_hsubpd(a, b);
    else
    {
        // On GDC this generates hsubpd with -O1
        immutable double diffA = a.array[0] - a.array[1];
        immutable double diffB = b.array[0] - b.array[1];
        __m128d diffs; // PERF =void;
        diffs.ptr[0] = diffA;
        diffs.ptr[1] = diffB;
        return diffs;
    }
}
unittest
{
    __m128d lhs = _mm_setr_pd(1.5, 2.0);
    __m128d rhs = _mm_setr_pd(1.0, 2.0);
    double[2] expected = [-0.5, -1.0];
    assert(_mm_hsub_pd(lhs, rhs).array == expected);
}
159 
/// Horizontally subtract adjacent pairs of single-precision (32-bit)
/// floating-point elements in `a` and `b`.
///
/// Returns: `[a[0] - a[1], a[2] - a[3], b[0] - b[1], b[2] - b[3]]`.
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (GDC_with_SSE3)
    {
        // GDC does not produce hsubps from the scalar fallback below,
        // so request it explicitly through the GCC builtin.
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_SSE3)
    {
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // Flip the sign bit of the odd lanes, then do a pairwise add:
        // x[0] + (-x[1]) gives the wanted x[0] - x[1].
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // Portable fallback: two pair differences from `a`, then two from `b`.
        __m128 res; // PERF =void;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}
192 
/// Load 128 bits of integer data from unaligned memory.
/// This is a plain alias of `_mm_loadu_si128`; no dedicated LDDQU path is provided.
// Note: LDDQU is reported to have only ever been useful around 2008.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
197 
/// Read one double-precision (64-bit) floating-point value from `mem_addr`
/// and broadcast it to both elements of the result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    immutable double scalar = *mem_addr;
    __m128d dup; // PERF =void;
    dup.ptr[0] = scalar;
    dup.ptr[1] = scalar;
    return dup;
}
unittest
{
    double a = 7.5;
    version(LDC)
    {
        // Whole-vector comparison.
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        // For some reason, this line used to break with LDC, but not when isolated! Was never reported.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}
223 
/// Broadcast the low double-precision (64-bit) floating-point element
/// of `a` to both elements of the result.
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    immutable double low = a.array[0];
    a.ptr[1] = low;
    return a;
}
unittest
{
    __m128d input = _mm_setr_pd(7.0, 2.5);
    double[2] expected = [7.0, 7.0];
    assert(_mm_movedup_pd(input).array == expected);
}
237 
/// Copy each odd-indexed single-precision (32-bit) floating-point element
/// of `a` over the even-indexed element just below it:
/// the result is `[a[1], a[1], a[3], a[3]]`.
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    immutable float odd0 = a.array[1];
    immutable float odd1 = a.array[3];
    a.ptr[0] = odd0;
    a.ptr[2] = odd1;
    return a;
}
unittest
{
    __m128 R = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] expected = [2.0f, 2.0f, 4.0f, 4.0f];
    assert(R.array == expected);
}
253 
/// Copy each even-indexed single-precision (32-bit) floating-point element
/// of `a` over the odd-indexed element just above it:
/// the result is `[a[0], a[0], a[2], a[2]]`.
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    immutable float even0 = a.array[0];
    immutable float even1 = a.array[2];
    a.ptr[1] = even0;
    a.ptr[3] = even1;
    return a;
}
unittest
{
    __m128 R = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] expected = [1.0f, 1.0f, 3.0f, 3.0f];
    assert(R.array == expected);
}