1 /**
2 * SSE3 intrinsics.
3 *
4 * Copyright: Guillaume Piolat 2016-2020.
5 *            Charles Gregory 2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module inteli.pmmintrin;
9 
10 public import inteli.types;
11 import inteli.internals;
12 public import inteli.emmintrin;
13 
14 
// Note: this header will work whether you have SSE3 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse3"] or equivalent to actively
// generate SSE3 instructions (they are often enabled with -O1 or greater).
18 
19 
20 nothrow @nogc:
21 
/// Alternatively add and subtract packed double-precision (64-bit) 
/// floating-point elements in `a` to/from packed elements in `b`.
/// Result lane 0 is `a[0] - b[0]`, lane 1 is `a[1] + b[1]` (addsubpd semantics).
__m128d _mm_addsub_pd (__m128d a, __m128d b) pure @trusted
{
    // Note: generates addsubpd since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin, because it doesn't generate addsubpd
    // ARM: well optimized starting with LDC 1.18.0 -O2
    // NOTE: the exact statement shape below is what the compilers pattern-match
    // to emit addsubpd; keep it as-is.
    a.ptr[0] = a.array[0] - b.array[0]; // lane 0: subtract
    a.ptr[1] = a.array[1] + b.array[1]; // lane 1: add
    return a;
}
unittest
{
    auto v1 =_mm_setr_pd(1.0,2.0);
    auto v2 =_mm_setr_pd(1.0,2.0);
    assert(_mm_addsub_pd(v1,v2).array == _mm_setr_pd(0.0,4.0).array);
}
39 
/// Alternatively add and subtract packed single-precision (32-bit) 
/// floating-point elements in `a` to/from packed elements in `b`.
/// Even lanes subtract, odd lanes add (addsubps semantics):
/// `[a0-b0, a1+b1, a2-b2, a3+b3]`.
float4 _mm_addsub_ps (float4 a, float4 b) pure @trusted
{
    // Note: generates addsubps since LDC 1.3.0 with -O1
    // PERF: for GDC, detect SSE3 and use the relevant builtin
    // NOTE: the exact statement shape below is what LDC pattern-matches
    // to emit addsubps; keep it as-is.
    a.ptr[0] -= b.array[0]; // even lane: subtract
    a.ptr[1] += b.array[1]; // odd lane: add
    a.ptr[2] -= b.array[2];
    a.ptr[3] += b.array[3];
    return a;
}
unittest
{
    auto v1 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    auto v2 =_mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    assert( _mm_addsub_ps(v1,v2).array == _mm_setr_ps(0.0f, 4.0f, 0.0f, 8.0f).array );
}
58 
59 
/// Horizontally add adjacent pairs of double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
/// Result: `[a1+a0, b1+b0]` (haddpd semantics).
__m128d _mm_hadd_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the native builtin directly.
        return __builtin_ia32_haddpd(a, b);
    }
    else
    {
       // On GDC this generates haddpd with -O1
        __m128d res;
        res.ptr[0] = a.array[1] + a.array[0]; // pair from a
        res.ptr[1] = b.array[1] + b.array[0]; // pair from b
        return res;
    }
}
unittest
{
    auto A =_mm_setr_pd(1.5, 2.0);
    auto B =_mm_setr_pd(1.0, 2.0);
    assert( _mm_hadd_pd(A, B).array ==_mm_setr_pd(3.5, 3.0).array );
}
83 
// PERF: for GDC, detect SSE3 and use the relevant builtin
/// Horizontally add adjacent pairs of single-precision (32-bit) 
/// floating-point elements in `a` and `b`.
/// Result: `[a1+a0, a3+a2, b1+b0, b3+b2]` (haddps semantics).
__m128 _mm_hadd_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the native builtin directly.
        return __builtin_ia32_haddps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // AArch64: FADDP (pairwise add) matches haddps lane-for-lane.
        return vpaddq_f32(a, b);
    }
    else
    {    
        // Portable scalar fallback.
        __m128 res;
        res.ptr[0] = a.array[1] + a.array[0];
        res.ptr[1] = a.array[3] + a.array[2];
        res.ptr[2] = b.array[1] + b.array[0];
        res.ptr[3] = b.array[3] + b.array[2];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert( _mm_hadd_ps(A, B).array == _mm_setr_ps(3.0f, 8.0f, 3.5f, 7.5f).array );
}
113 
/// Horizontally subtract adjacent pairs of double-precision (64-bit) 
/// floating-point elements in `a` and `b`.
/// Result: `[a0-a1, b0-b1]` (hsubpd semantics).
__m128d _mm_hsub_pd (__m128d a, __m128d b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the native builtin directly.
        return __builtin_ia32_hsubpd(a, b);
    }
    else
    {        
        // On GDC this generates hsubpd with -O1
        __m128d res;
        res.ptr[0] = a.array[0] - a.array[1]; // pair from a
        res.ptr[1] = b.array[0] - b.array[1]; // pair from b
        return res;
    }
}
unittest
{
    auto A =_mm_setr_pd(1.5, 2.0);
    auto B =_mm_setr_pd(1.0, 2.0);
    assert( _mm_hsub_pd(A, B).array ==_mm_setr_pd(-0.5, -1.0).array );
}
137 
/// Horizontally subtract adjacent pairs of single-precision (32-bit) 
/// floating-point elements in `a` and `b`.
/// Result: `[a0-a1, a2-a3, b0-b1, b2-b3]` (hsubps semantics).
__m128 _mm_hsub_ps (__m128 a, __m128 b) pure @trusted
{
    static if (LDC_with_SSE3)
    {
        // LDC with SSE3 available: use the native builtin directly.
        return __builtin_ia32_hsubps(a, b);
    }
    else static if (LDC_with_ARM64)
    {
        // AArch64 has no pairwise subtract, but XOR-ing the IEEE-754 sign bit
        // of the odd lanes negates them, turning pairwise add (FADDP) into
        // pairwise subtract: a0 + (-a1) == a0 - a1.
        int4 mask = [0, 0x80000000, 0, 0x80000000];
        a = cast(__m128)(cast(int4)a ^ mask);
        b = cast(__m128)(cast(int4)b ^ mask);
        return vpaddq_f32(a, b);
    }
    else
    {
        // PERF: GDC doesn't generate the right instruction, do something
        __m128 res;
        res.ptr[0] = a.array[0] - a.array[1];
        res.ptr[1] = a.array[2] - a.array[3];
        res.ptr[2] = b.array[0] - b.array[1];
        res.ptr[3] = b.array[2] - b.array[3];
        return res;
    }
}
unittest
{
    __m128 A =_mm_setr_ps(1.0f, 2.0f, 3.0f, 5.0f);
    __m128 B =_mm_setr_ps(1.5f, 2.0f, 3.5f, 4.0f);
    assert(_mm_hsub_ps(A, B).array == _mm_setr_ps(-1.0f, -2.0f, -0.5f, -0.5f).array);
}
170 
/// Load 128-bits of integer data from unaligned memory.
// Note: LDDQU reportedly only gave a benefit on hardware from around 2008;
// on everything since, a plain unaligned load is equivalent, so alias it.
// See_also: https://stackoverflow.com/questions/38370622/a-faster-integer-sse-unalligned-load-thats-rarely-used
alias _mm_lddqu_si128 = _mm_loadu_si128;
175 
/// Load a double-precision (64-bit) floating-point element from memory into both elements of result.
__m128d _mm_loaddup_pd (const(double)* mem_addr) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Same for GDC with -O1
    // NOTE: the load-then-broadcast shape below is what the compilers
    // pattern-match to emit movddup; keep it as-is.
    double value = *mem_addr;
    __m128d res;
    res.ptr[0] = value;
    res.ptr[1] = value;
    return res;
}
unittest
{
    version(LDC)
    {
        double a = 7.5;
        assert(_mm_loaddup_pd(&a) == _mm_set_pd(7.5, 7.5));
    }
    else
    {
        double a = 7.5;
        // For some reason, this line used to break with LDC, but not when isolated! Was never reported.
        // Hence the per-compiler split: element-wise .array comparison here,
        // whole-vector comparison in the LDC branch above.
        assert(_mm_loaddup_pd(&a).array == _mm_set_pd(7.5, 7.5).array);
    }
}
201 
/// Duplicate the low double-precision (64-bit) floating-point element from `a`.
/// Result: `[a0, a0]` (movddup semantics).
__m128d _mm_movedup_pd (__m128d a) pure @trusted
{
    // Note: generates movddup since LDC 1.3 with -O1 -mattr=+sse3
    // Something efficient with -O1 for GDC
    a.ptr[1] = a.array[0]; // broadcast low lane into high lane
    return a;
}
unittest
{
    __m128d A = _mm_setr_pd(7.0, 2.5);
    assert(_mm_movedup_pd(A).array == _mm_set_pd(7.0, 7.0).array);
}
215 
/// Duplicate odd-indexed single-precision (32-bit) floating-point elements from `a`.
/// Result: `[a1, a1, a3, a3]` (movshdup semantics).
__m128 _mm_movehdup_ps (__m128 a) pure @trusted
{
    // Generates movshdup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[0] = a.array[1]; // copy lane 1 over lane 0
    a.ptr[2] = a.array[3]; // copy lane 3 over lane 2
    return a;
}
unittest
{
    __m128 A = _mm_movehdup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [2.0f, 2, 4, 4 ];
    assert(A.array == correct);
}
231 
/// Duplicate even-indexed single-precision (32-bit) floating-point elements from `a`.
/// Result: `[a0, a0, a2, a2]` (movsldup semantics).
__m128 _mm_moveldup_ps (__m128 a) pure @trusted
{
    // Generates movsldup since LDC 1.3 with -O1 -mattr=+sse3
    // PERF but GDC never generates it
    a.ptr[1] = a.array[0]; // copy lane 0 over lane 1
    a.ptr[3] = a.array[2]; // copy lane 2 over lane 3
    return a;
}
unittest
{
    __m128 A = _mm_moveldup_ps(_mm_setr_ps(1, 2, 3, 4));
    float[4] correct = [1.0f, 1, 3, 3 ];
    assert(A.array == correct);
}