NEONVec4f_inl.h
#pragma once

PL_ALWAYS_INLINE plSimdVec4f::plSimdVec4f()
{
  PL_CHECK_SIMD_ALIGNMENT(this);

#if PL_ENABLED(PL_MATH_CHECK_FOR_NAN)
  // Initialize all data to NaN in debug mode to make problems with uninitialized data easier to find.
  m_v = vmovq_n_f32(plMath::NaN<float>());
#endif
}
PL_ALWAYS_INLINE plSimdVec4f::plSimdVec4f(float xyzw)
{
  PL_CHECK_SIMD_ALIGNMENT(this);

  m_v = vmovq_n_f32(xyzw);
}

PL_ALWAYS_INLINE plSimdVec4f::plSimdVec4f(const plSimdFloat& xyzw)
{
  PL_CHECK_SIMD_ALIGNMENT(this);

  m_v = xyzw.m_v;
}

PL_ALWAYS_INLINE plSimdVec4f::plSimdVec4f(float x, float y, float z, float w)
{
  PL_CHECK_SIMD_ALIGNMENT(this);

  alignas(16) float values[4] = {x, y, z, w};
  m_v = vld1q_f32(values);
}

PL_ALWAYS_INLINE void plSimdVec4f::Set(float xyzw)
{
  m_v = vmovq_n_f32(xyzw);
}

PL_ALWAYS_INLINE void plSimdVec4f::Set(float x, float y, float z, float w)
{
  alignas(16) float values[4] = {x, y, z, w};
  m_v = vld1q_f32(values);
}

PL_ALWAYS_INLINE void plSimdVec4f::SetX(const plSimdFloat& f)
{
  m_v = vsetq_lane_f32(f, m_v, 0);
}

PL_ALWAYS_INLINE void plSimdVec4f::SetY(const plSimdFloat& f)
{
  m_v = vsetq_lane_f32(f, m_v, 1);
}

PL_ALWAYS_INLINE void plSimdVec4f::SetZ(const plSimdFloat& f)
{
  m_v = vsetq_lane_f32(f, m_v, 2);
}

PL_ALWAYS_INLINE void plSimdVec4f::SetW(const plSimdFloat& f)
{
  m_v = vsetq_lane_f32(f, m_v, 3);
}

PL_ALWAYS_INLINE void plSimdVec4f::SetZero()
{
  m_v = vmovq_n_f32(0.0f);
}
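// The Load/Store specializations below read or write exactly N floats: a single 32-bit lane for N == 1,
// one 64-bit lane (two packed floats) for N == 2, and a combination of both for N == 3. Only the N == 4
// variants touch a full float32x4_t worth of memory.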
template <>
PL_ALWAYS_INLINE void plSimdVec4f::Load<1>(const float* pFloat)
{
  m_v = vld1q_lane_f32(pFloat, vmovq_n_f32(0.0f), 0);
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Load<2>(const float* pFloat)
{
  m_v = vreinterpretq_f32_f64(vld1q_lane_f64(reinterpret_cast<const float64_t*>(pFloat), vmovq_n_f64(0.0), 0));
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Load<3>(const float* pFloat)
{
  m_v = vcombine_f32(vld1_f32(pFloat), vld1_lane_f32(pFloat + 2, vmov_n_f32(0.0f), 0));
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Load<4>(const float* pFloat)
{
  m_v = vld1q_f32(pFloat);
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Store<1>(float* pFloat) const
{
  vst1q_lane_f32(pFloat, m_v, 0);
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Store<2>(float* pFloat) const
{
  vst1q_lane_f64(reinterpret_cast<float64_t*>(pFloat), vreinterpretq_f64_f32(m_v), 0);
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Store<3>(float* pFloat) const
{
  vst1q_lane_f64(reinterpret_cast<float64_t*>(pFloat), vreinterpretq_f64_f32(m_v), 0);
  vst1q_lane_f32(pFloat + 2, m_v, 2);
}

template <>
PL_ALWAYS_INLINE void plSimdVec4f::Store<4>(float* pFloat) const
{
  vst1q_f32(pFloat, m_v);
}
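// vrecpeq_f32 yields a low-precision reciprocal estimate, and vrecpsq_f32(a, b) computes (2 - a * b),
// the correction factor of a Newton-Raphson step for 1/a: x' = x * (2 - a * x). Each iteration roughly
// doubles the number of correct mantissa bits, hence the BITS_12 / BITS_23 accuracy names.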
template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetReciprocal<plMathAcc::BITS_12>() const
{
  float32x4_t x0 = vrecpeq_f32(m_v);

  // One iteration of Newton-Raphson
  float32x4_t x1 = vmulq_f32(vrecpsq_f32(m_v, x0), x0);

  return x1;
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetReciprocal<plMathAcc::BITS_23>() const
{
  float32x4_t x0 = vrecpeq_f32(m_v);

  // Two iterations of Newton-Raphson
  float32x4_t x1 = vmulq_f32(vrecpsq_f32(m_v, x0), x0);
  float32x4_t x2 = vmulq_f32(vrecpsq_f32(m_v, x1), x1);

  return x2;
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetReciprocal<plMathAcc::FULL>() const
{
  return vdivq_f32(vmovq_n_f32(1.0f), m_v);
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetInvSqrt<plMathAcc::FULL>() const
{
  return vdivq_f32(vmovq_n_f32(1.0f), vsqrtq_f32(m_v));
}
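// vrsqrteq_f32 yields a low-precision estimate of 1/sqrt(v), and vrsqrtsq_f32(a, b) computes (3 - a * b) / 2,
// the correction factor of a Newton-Raphson step for the inverse square root: x' = x * (3 - v * x * x) / 2.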
template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetInvSqrt<plMathAcc::BITS_23>() const
{
  const float32x4_t x0 = vrsqrteq_f32(m_v);

  // Two iterations of Newton-Raphson
  const float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x0, m_v), x0), x0);
  return vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, m_v), x1), x1);
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetInvSqrt<plMathAcc::BITS_12>() const
{
  const float32x4_t x0 = vrsqrteq_f32(m_v);

  // One iteration of Newton-Raphson
  return vmulq_f32(vrsqrtsq_f32(vmulq_f32(x0, m_v), x0), x0);
}
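// The approximate square root variants reuse the inverse square root via the identity
// sqrt(v) == v * (1 / sqrt(v)). Note that a zero input goes through an infinite reciprocal estimate,
// so the approximate variants should not be expected to return exactly 0 for it.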
template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetSqrt<plMathAcc::BITS_12>() const
{
  return CompMul(GetInvSqrt<plMathAcc::BITS_12>());
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetSqrt<plMathAcc::BITS_23>() const
{
  return CompMul(GetInvSqrt<plMathAcc::BITS_23>());
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetSqrt<plMathAcc::FULL>() const
{
  return vsqrtq_f32(m_v);
}
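// Branch-free normalization: the vector is normalized unconditionally, then every lane is AND-masked
// with the comparison result, so a (near-)zero-length input yields a zero vector instead of Inf/NaN.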
template <int N, plMathAcc::Enum acc>
void plSimdVec4f::NormalizeIfNotZero(const plSimdFloat& fEpsilon)
{
  plSimdFloat sqLength = GetLengthSquared<N>();
  uint32x4_t isNotZero = vcgtq_f32(sqLength.m_v, fEpsilon.m_v);
  m_v = vmulq_f32(m_v, sqLength.GetInvSqrt<acc>().m_v);
  m_v = vreinterpretq_f32_u32(vandq_u32(isNotZero, vreinterpretq_u32_f32(m_v)));
}
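// NeonMoveMask presumably packs the top bit of each 32-bit lane into a 4-bit integer (mirroring SSE's
// _mm_movemask_ps); PL_BIT(N) - 1 then restricts the test to the first N components.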
template <int N>
PL_ALWAYS_INLINE bool plSimdVec4f::IsZero() const
{
  const int mask = PL_BIT(N) - 1;
  return (plInternal::NeonMoveMask(vceqzq_f32(m_v)) & mask) == mask;
}

template <int N>
PL_ALWAYS_INLINE bool plSimdVec4f::IsZero(const plSimdFloat& fEpsilon) const
{
  const int mask = PL_BIT(N) - 1;
  float32x4_t absVal = Abs().m_v;
  return (plInternal::NeonMoveMask(vcltq_f32(absVal, fEpsilon.m_v)) & mask) == mask;
}
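// IEEE 754 guarantees NaN != NaN, so the lanes that compare unequal to themselves are exactly the NaN lanes.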
template <int N>
inline bool plSimdVec4f::IsNaN() const
{
  const int mask = PL_BIT(N) - 1;
  return (plInternal::NeonMoveMask(vceqq_f32(m_v, m_v)) & mask) != mask;
}
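// A float is Inf or NaN exactly when its exponent bits are all ones, i.e. when its absolute value,
// viewed as an unsigned integer, is >= 0x7f800000.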
template <int N>
PL_ALWAYS_INLINE bool plSimdVec4f::IsValid() const
{
  const int mask = PL_BIT(N) - 1;
  // Clear the sign bit before the unsigned comparison; otherwise every negative float would compare
  // >= 0x7f800000 and be reported as invalid.
  const uint32x4_t absBits = vandq_u32(vreinterpretq_u32_f32(m_v), vmovq_n_u32(0x7fffffff));
  return (plInternal::NeonMoveMask(vcgeq_u32(absBits, vmovq_n_u32(0x7f800000))) & mask) == 0;
}
template <int N>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::GetComponent() const
{
  return vdupq_laneq_f32(m_v, N);
}

PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::x() const
{
  return GetComponent<0>();
}

PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::y() const
{
  return GetComponent<1>();
}

PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::z() const
{
  return GetComponent<2>();
}

PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::w() const
{
  return GetComponent<3>();
}
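// The swizzles map plSwizzle enums to __builtin_shufflevector index lists via PL_TO_SHUFFLE.
// Get() shuffles this vector with itself; GetCombined() picks x and y from this vector and
// z and w from 'other': x = this[s0], y = this[s1], z = other[s2], w = other[s3].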
template <plSwizzle::Enum s>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Get() const
{
  return __builtin_shufflevector(m_v, m_v, PL_TO_SHUFFLE(s));
}

template <plSwizzle::Enum s>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetCombined(const plSimdVec4f& other) const
{
  return __builtin_shufflevector(m_v, other.m_v, PL_TO_SHUFFLE(s));
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::operator-() const
{
  return vnegq_f32(m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::operator+(const plSimdVec4f& v) const
{
  return vaddq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::operator-(const plSimdVec4f& v) const
{
  return vsubq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::operator*(const plSimdFloat& f) const
{
  return vmulq_f32(m_v, f.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::operator/(const plSimdFloat& f) const
{
  return vdivq_f32(m_v, f.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompMul(const plSimdVec4f& v) const
{
  return vmulq_f32(m_v, v.m_v);
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompDiv<plMathAcc::FULL>(const plSimdVec4f& v) const
{
  return vdivq_f32(m_v, v.m_v);
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompDiv<plMathAcc::BITS_23>(const plSimdVec4f& v) const
{
  return CompMul(v.GetReciprocal<plMathAcc::BITS_23>());
}

template <>
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompDiv<plMathAcc::BITS_12>(const plSimdVec4f& v) const
{
  return CompMul(v.GetReciprocal<plMathAcc::BITS_12>());
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompMin(const plSimdVec4f& v) const
{
  return vminq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CompMax(const plSimdVec4f& v) const
{
  return vmaxq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Abs() const
{
  return vabsq_f32(m_v);
}
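// The rounding functions map directly to the AArch64 rounding intrinsics: vrndnq = to nearest
// (ties to even), vrndmq = toward -infinity, vrndpq = toward +infinity, vrndq = toward zero.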
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Round() const
{
  return vrndnq_f32(m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Floor() const
{
  return vrndmq_f32(m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Ceil() const
{
  return vrndpq_f32(m_v);
}

PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Trunc() const
{
  return vrndq_f32(m_v);
}
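// cmp lanes are all ones or all zeros; shifting left by 31 leaves only a sign-bit mask,
// and XOR with it negates exactly the selected components.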
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::FlipSign(const plSimdVec4b& cmp) const
{
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(m_v), vshlq_n_u32(cmp.m_v, 31)));
}
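// vbslq_f32 is a bitwise select: for every bit set in cmp it takes the bit from ifTrue, otherwise from ifFalse.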
// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::Select(const plSimdVec4b& cmp, const plSimdVec4f& ifTrue, const plSimdVec4f& ifFalse)
{
  return vbslq_f32(cmp.m_v, ifTrue.m_v, ifFalse.m_v);
}

PL_ALWAYS_INLINE plSimdVec4f& plSimdVec4f::operator+=(const plSimdVec4f& v)
{
  m_v = vaddq_f32(m_v, v.m_v);
  return *this;
}

PL_ALWAYS_INLINE plSimdVec4f& plSimdVec4f::operator-=(const plSimdVec4f& v)
{
  m_v = vsubq_f32(m_v, v.m_v);
  return *this;
}

PL_ALWAYS_INLINE plSimdVec4f& plSimdVec4f::operator*=(const plSimdFloat& f)
{
  m_v = vmulq_f32(m_v, f.m_v);
  return *this;
}

PL_ALWAYS_INLINE plSimdVec4f& plSimdVec4f::operator/=(const plSimdFloat& f)
{
  m_v = vdivq_f32(m_v, f.m_v);
  return *this;
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator==(const plSimdVec4f& v) const
{
  return vceqq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator!=(const plSimdVec4f& v) const
{
  return vmvnq_u32(vceqq_f32(m_v, v.m_v));
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator<=(const plSimdVec4f& v) const
{
  return vcleq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator<(const plSimdVec4f& v) const
{
  return vcltq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator>=(const plSimdVec4f& v) const
{
  return vcgeq_f32(m_v, v.m_v);
}

PL_ALWAYS_INLINE plSimdVec4b plSimdVec4f::operator>(const plSimdVec4f& v) const
{
  return vcgtq_f32(m_v, v.m_v);
}
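// The horizontal reductions build on the pairwise AArch64 intrinsics: vpadd/vpmin/vpmax fold neighboring
// lanes, and the scalar forms (vpadds/vpmins/vpmaxs) reduce a float32x2_t to a single float.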
template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalSum<2>() const
{
  return vpadds_f32(vget_low_f32(m_v));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalSum<3>() const
{
  return HorizontalSum<2>() + GetComponent<2>();
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalSum<4>() const
{
  float32x2_t x0 = vpadd_f32(vget_low_f32(m_v), vget_high_f32(m_v));
  return vpadds_f32(x0);
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMin<2>() const
{
  return vpmins_f32(vget_low_f32(m_v));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMin<3>() const
{
  return vminq_f32(vmovq_n_f32(vpmins_f32(vget_low_f32(m_v))), vdupq_laneq_f32(m_v, 2));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMin<4>() const
{
  return vpmins_f32(vpmin_f32(vget_low_f32(m_v), vget_high_f32(m_v)));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMax<2>() const
{
  return vpmaxs_f32(vget_low_f32(m_v));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMax<3>() const
{
  return vmaxq_f32(vmovq_n_f32(vpmaxs_f32(vget_low_f32(m_v))), vdupq_laneq_f32(m_v, 2));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::HorizontalMax<4>() const
{
  return vpmaxs_f32(vpmax_f32(vget_low_f32(m_v), vget_high_f32(m_v)));
}
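// Dot<N> multiplies component-wise and then sums the first N lanes.
// For example (hypothetical values): Dot<3> of (1, 2, 3, *) and (4, 5, 6, *) is 1*4 + 2*5 + 3*6 = 32.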
template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::Dot<1>(const plSimdVec4f& v) const
{
  return vdupq_laneq_f32(vmulq_f32(m_v, v.m_v), 0);
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::Dot<2>(const plSimdVec4f& v) const
{
  return vpadds_f32(vmul_f32(vget_low_f32(m_v), vget_low_f32(v.m_v)));
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::Dot<3>(const plSimdVec4f& v) const
{
  return CompMul(v).HorizontalSum<3>();
}

template <>
PL_ALWAYS_INLINE plSimdFloat plSimdVec4f::Dot<4>(const plSimdVec4f& v) const
{
  return CompMul(v).HorizontalSum<4>();
}
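// 3D cross product, w is ignored. Uses the shuffle identity cross(a, b) = (a * b.yzxw - b * a.yzxw).yzxw,
// which needs one shuffle fewer than the textbook a.yzxw * b.zxyw - a.zxyw * b.yzxw form.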
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CrossRH(const plSimdVec4f& v) const
{
  float32x4_t a = vmulq_f32(m_v, __builtin_shufflevector(v.m_v, v.m_v, PL_TO_SHUFFLE(plSwizzle::YZXW)));
  float32x4_t b = vmulq_f32(v.m_v, __builtin_shufflevector(m_v, m_v, PL_TO_SHUFFLE(plSwizzle::YZXW)));
  float32x4_t c = vsubq_f32(a, b);

  return __builtin_shufflevector(c, c, PL_TO_SHUFFLE(plSwizzle::YZXW));
}
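// Generates an arbitrary vector such that Dot<3>(GetOrthogonalVector()) == 0.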
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::GetOrthogonalVector() const
{
  // See http://blog.selfshadow.com/2011/10/17/perp-vectors/ - this is Stark's first variant, SIMDified.
  return CrossRH(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m_v), vceqq_f32(m_v, HorizontalMin<3>().m_v))));
}
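// vfmaq_f32(c, a, b) computes c + a * b as a single fused operation, so MulAdd returns a * b + c
// without an intermediate rounding step.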
// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::MulAdd(const plSimdVec4f& a, const plSimdVec4f& b, const plSimdVec4f& c)
{
  return vfmaq_f32(c.m_v, a.m_v, b.m_v);
}

// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::MulAdd(const plSimdVec4f& a, const plSimdFloat& b, const plSimdVec4f& c)
{
  return vfmaq_f32(c.m_v, a.m_v, b.m_v);
}
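// vfmsq_f32(c, a, b) computes c - a * b, so the result is negated to obtain a * b - c.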
// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::MulSub(const plSimdVec4f& a, const plSimdVec4f& b, const plSimdVec4f& c)
{
  return vnegq_f32(vfmsq_f32(c.m_v, a.m_v, b.m_v));
}

// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::MulSub(const plSimdVec4f& a, const plSimdFloat& b, const plSimdVec4f& c)
{
  return vnegq_f32(vfmsq_f32(c.m_v, a.m_v, b.m_v));
}
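// vbslq_f32 with an 0x80000000 mask takes only the sign bit from 'sign' and all remaining bits
// (exponent and mantissa) from 'magnitude'.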
// static
PL_ALWAYS_INLINE plSimdVec4f plSimdVec4f::CopySign(const plSimdVec4f& magnitude, const plSimdVec4f& sign)
{
  return vbslq_f32(vmovq_n_u32(0x80000000), sign.m_v, magnitude.m_v);
}