/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2006 Torus Knot Software Ltd
Also see acknowledgements in Readme.html

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt.

You may alternatively use this source under the terms of a specific version of
the OGRE Unrestricted License provided you have obtained such a license from
Torus Knot Software Ltd.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is
// required to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee's stack pointer (esp)
// is aligned to a 16-byte boundary; it says nothing about the frame
// pointer (ebp). Since most compilers access stack variables through the
// frame pointer, functions that require alignment must be wrapped in an
// extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that -mpreferred-stack-boundary
// works. Otherwise, all bets are off. However,
// -mpreferred-stack-boundary does not create stack alignment; it only
// preserves it. Unfortunately, since Ogre is designed as a flexible
// library, users might compile their applications with the wrong stack
// alignment. Even if a user takes care of stack alignment, many versions
// of libc on Linux call main() with the wrong initial stack alignment,
// with the result that the code is pessimally aligned instead of having
// a 50% chance of being correct.
//
#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andl $-16, %esp");                   \
    }


#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically

#endif

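// Illustrative sketch (not part of the original header), using the
// hypothetical functions processChunk/processChunkImpl: because the macro
// only fixes esp, not ebp, the alignment-sensitive SSE work lives in a
// separate callee that is entered after the stack has been re-aligned.
#if 0
static void processChunkImpl(float* data, size_t count); // uses __m128 locals

void processChunk(float* data, size_t count)
{
#if defined(__OGRE_SIMD_ALIGN_STACK)
    __OGRE_SIMD_ALIGN_STACK();
#endif
    processChunkImpl(data, count);  // callee now starts on an aligned esp
}
#endif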

// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include "OgreNoMemoryMacros.h"
#include <xmmintrin.h>
#include "OgreMemoryMacros.h"

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h" has
// already been included.
//
// Note: on some platforms gcc already includes "xmmintrin.h" for some
// reason. The guard macro _XMMINTRIN_H_INCLUDED used here is taken from
// the "xmmintrin.h" that ships with cygwin gcc 3.4.4; it should avoid
// duplicate-definition problems on gcc for x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate VC/ICC intrinsics. Only the intrinsics actually used are
// declared here.

typedef float __m128 __attribute__ ((mode(V4SF),aligned(16)));
typedef int __m64 __attribute__ ((mode(V2SI)));

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE    FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle instruction must be declared as a macro

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
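
// For example (sketch, not in the original header): _MM_SHUFFLE(3,2,1,0)
// encodes 0xE4, the identity permutation, so
// _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,2,1,0)) returns v unchanged, while
// _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0)) broadcasts element 0 into
// all four lanes.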

#define _mm_shuffle_ps(a, b, imm8) __extension__                                        \
    ({                                                                                  \
        __m128 result;                                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8));   \
        result;                                                                         \
    })


// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                               \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)              \
    {                                                                       \
        __m128 result;                                                      \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr));     \
        return result;                                                      \
    }

#define __MM_DECL_LD2(name, instruction, type)                                      \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr)          \
    {                                                                               \
        __m128 result;                                                              \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr));   \
        return result;                                                              \
    }

#define __MM_DECL_ST(name, instruction, type)                               \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)          \
    {                                                                       \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));        \
    }

__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)
__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST

// Two-operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)                        \
    {                                                                                   \
        __m128 result;                                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b));    \
        return result;                                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

// Other instructions used

    static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
    {
        __m128 tmp = _mm_load_ss(addr);
        return _mm_shuffle_ps(tmp, tmp, 0);
    }

    static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
    {
        __m128 result;
        __asm__("xorps %0, %0" : "=x" (result));
        return result;
    }

    static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
    {
        __m128 result;
        __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
        //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
        return result;
    }

    static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
    {
        int result;
        __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
        return result;
    }

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

/** The macro __MM_RSQRT_PS calculates the reciprocal square root, and
    should be used for normalising normals only. It can either use a
    Newton-Raphson-refined reciprocal square root for higher precision,
    or use the SSE rsqrt instruction directly; profiling decides which
    one is the better choice.
@note
    Prefer never to use the Newton-Raphson reciprocal square root at all,
    since speed tests indicate a 10% performance loss for the unrolled
    version and a 25% loss for the general version (P4 3.0 GHz HT). The
    slight loss in precision is not important when normalising normals.
*/
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif

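// Illustrative sketch (not part of the original header): normalising four
// 3-D vectors at once with __MM_RSQRT_PS. 'x', 'y' and 'z' are hypothetical
// __m128 values, each holding one component of four vectors (SoA layout).
#if 0
__m128 sqlen = _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)),
                          _mm_mul_ps(z, z));    // x*x + y*y + z*z
__m128 invlen = __MM_RSQRT_PS(sqlen);           // approx. 1 / sqrt(sqlen)
x = _mm_mul_ps(x, invlen);
y = _mm_mul_ps(y, invlen);
z = _mm_mul_ps(z, invlen);
#endif
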
/** Perform the transpose of a 4x4 matrix of single precision floating
    point values.
    Arguments r0, r1, r2, and r3 are __m128 values whose elements
    form the corresponding rows of a 4x4 matrix.
    The matrix transpose is returned in arguments r0, r1, r2, and
    r3, where r0 now holds column 0 of the original matrix, r1 now
    holds column 1 of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                        \
    {                                                                               \
        __m128 t3, t2, t1, t0;                                                      \
                                                                                    \
                                                            /* r00 r01 r02 r03 */   \
                                                            /* r10 r11 r12 r13 */   \
                                                            /* r20 r21 r22 r23 */   \
                                                            /* r30 r31 r32 r33 */   \
                                                                                    \
        t0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */   \
        t2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */   \
        t1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */   \
        t3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */   \
                                                                                    \
        r0 = _mm_movelh_ps(t0, t1);                         /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(t1, t0);                         /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(t2, t3);                         /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(t3, t2);                         /* r03 r13 r23 r33 */   \
    }

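// Illustrative sketch (not part of the original header): transposing a
// row-major 4x4 matrix stored in a hypothetical float array 'm' of 16
// elements.
#if 0
__m128 r0 = _mm_loadu_ps(m + 0);
__m128 r1 = _mm_loadu_ps(m + 4);
__m128 r2 = _mm_loadu_ps(m + 8);
__m128 r3 = _mm_loadu_ps(m + 12);
__MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // r0..r3 now hold columns 0..3
#endif
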
/** Perform the transpose of continuously stored rows of a 4x3 matrix to
    a 3x4 matrix of single precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding continuously stored rows of a 4x3 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, where
    v0 now holds column 0 of the original matrix, v1 now holds column 1
    of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 t0, t1, t2;                                                          \
                                                                                    \
                                                            /* r00 r01 r02 r10 */   \
                                                            /* r11 r12 r20 r21 */   \
                                                            /* r22 r30 r31 r32 */   \
                                                                                    \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */   \
        t1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */   \
        t2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }
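
// Illustrative sketch (not part of the original header): the 12 consecutive
// floats of a 4x3 matrix (hypothetical array 'm') are loaded as three packed
// values and transposed into column form.
#if 0
__m128 v0 = _mm_loadu_ps(m + 0);    // r00 r01 r02 r10
__m128 v1 = _mm_loadu_ps(m + 4);    // r11 r12 r20 r21
__m128 v2 = _mm_loadu_ps(m + 8);    // r22 r30 r31 r32
__MM_TRANSPOSE4x3_PS(v0, v1, v2);   // v0..v2 now hold columns 0..2
#endif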

/** Perform the transpose of a 3x4 matrix to continuously stored rows of
    a 4x3 matrix of single precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding columns of a 3x4 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, as
    continuously stored rows of a 4x3 matrix.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 t0, t1, t2;                                                          \
                                                                                    \
                                                            /* r00 r10 r20 r30 */   \
                                                            /* r01 r11 r21 r31 */   \
                                                            /* r02 r12 r22 r32 */   \
                                                                                    \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
        t1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
        t2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(t2, t0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }

/** Fill a vector of single precision floating point values with a selected
    element. Argument 'fp' is a digit [0123] that selects the element of
    argument 'v' to broadcast.
*/
#define __MM_SELECT(v, fp)                                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/// Accumulate four vectors of single precision floating point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Perform the dot product between two sets of four vectors of single
    precision floating point values.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Perform the dot product between a set of four vectors and a set of
    three vectors of single precision floating point values (the fourth
    term, r3, is accumulated unscaled).
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)
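
// Illustrative sketch (not part of the original header): transforming a
// packed vector 'v' by a column-stored 4x4 matrix (c0..c3, e.g. produced by
// __MM_TRANSPOSE4x4_PS); all names here are hypothetical.
#if 0
__m128 result = __MM_DOT4x4_PS(c0, c1, c2, c3,
                               __MM_SELECT(v, 0), __MM_SELECT(v, 1),
                               __MM_SELECT(v, 2), __MM_SELECT(v, 3));
// For an affine 4x3 matrix and w == 1, __MM_DOT4x3_PS saves one multiply:
__m128 point  = __MM_DOT4x3_PS(c0, c1, c2, c3,
                               __MM_SELECT(v, 0), __MM_SELECT(v, 1),
                               __MM_SELECT(v, 2));
#endif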

/// Accumulate three vectors of single precision floating point values.
#define __MM_ACCUM3_PS(a, b, c)                                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Perform the dot product between two sets of three vectors of single
    precision floating point values.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply two vectors and add a third vector (multiply-add).
#define __MM_MADD_PS(a, b, c)                                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation
#define __MM_LERP_PS(t, a, b)                                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Multiply two single floating point values and add a third (multiply-add).
#define __MM_MADD_SS(a, b, c)                                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation
#define __MM_LERP_SS(t, a, b)                                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

/// Same as _mm_load_ps, but can help VC generate more optimised code.
#define __MM_LOAD_PS(p)                                                             \
    (*(__m128*)(p))

/// Same as _mm_store_ps, but can help VC generate more optimised code.
#define __MM_STORE_PS(p, v)                                                         \
    (*(__m128*)(p) = (v))

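// Illustrative sketch (not part of the original header): blending two
// hypothetical 16-byte aligned float quadruples 'a' and 'b' by a weight 't'
// with __MM_LERP_PS, i.e. computing a + (b - a) * t, and storing to 'out'.
#if 0
__m128 va = __MM_LOAD_PS(a);
__m128 vb = __MM_LOAD_PS(b);
__m128 vt = _mm_load_ps1(&t);   // broadcast t into all four lanes
__MM_STORE_PS(out, __MM_LERP_PS(vt, va, vb));
#endif
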

    /** Helper to load/store SSE data depending on whether or not it is
        aligned.
    */
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };

    /** Check whether or not the given pointer is perfectly aligned for SSE.
    */
    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }
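
// Illustrative sketch (not part of the original header): dispatching to the
// matching accessor after a run-time alignment check on a hypothetical
// pointer 'src'.
#if 0
__m128 v;
if (_isAlignedForSSE(src))
    v = SSEMemoryAccessor<true>::load(src);    // aligned fast path
else
    v = SSEMemoryAccessor<false>::load(src);   // unaligned safe path
#endif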

    /** Calculate the Newton-Raphson reciprocal square root with the formula:
            0.5 * rsqrt(x) * (3 - x * rsqrt(x)^2)
    */
    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }
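
    // Derivation sketch (not part of the original header): one Newton-Raphson
    // step for f(y) = 1/y^2 - x refines an estimate y ~ rsqrt(x) as
    //     y' = y - f(y)/f'(y) = 0.5 * y * (3 - x * y * y)
    // which is exactly the formula above, and roughly doubles the ~12-bit
    // accuracy of the hardware rsqrtps estimate.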

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE
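
// Illustrative sketch (not part of the original header): a hypothetical SSE
// entry point asserting its stack alignment in debug builds before using
// __m128 stack variables.
#if 0
void blendVerticesSSE(void)
{
    __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
    // ... SSE code using __m128 locals ...
}
#endif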


#endif  // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__