/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2006 Torus Knot Software Ltd
Also see acknowledgements in Readme.html

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt.

You may alternatively use this source under the terms of a specific version of
the OGRE Unrestricted License provided you have obtained such a license from
Torus Knot Software Ltd.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is
// required to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee stack pointer (esp) is
//   aligned to a 16-byte boundary; it cannot do the same for the frame
//   pointer (ebp). Because most compilers use the frame pointer to access
//   stack variables, you need to wrap alignment-requiring functions in an
//   extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that
// -mpreferred-stack-boundary works. Otherwise, all bets are
// off. However, -mpreferred-stack-boundary does not create a
// stack alignment; it only preserves one. Unfortunately, since
// OGRE is designed as a flexible library, users might compile
// their applications with the wrong stack alignment. And even
// when the user takes care over stack alignment, many versions
// of libc on Linux call main() with the wrong initial stack
// alignment, with the result that the code is pessimally aligned
// instead of having a 50% chance of being correct.
//
#define __OGRE_SIMD_ALIGN_STACK()                               \
    {                                                           \
        /* Use alloca to allocate some memory on the stack.  */ \
        /* This alerts gcc that something funny is going on, */ \
        /* so that it does not omit the frame pointer etc.   */ \
        (void)__builtin_alloca(16);                             \
        /* Now align the stack pointer */                       \
        __asm__ __volatile__ ("andl $-16, %esp");               \
    }

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically

#endif
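
// A minimal usage sketch (illustrative only; the function names below are
// hypothetical). Since the macro only fixes up esp for the current frame,
// the SSE work goes into a separate callee, whose own frame is then fully
// 16-byte aligned:
//
//      static void _processVectorsImpl(float* dst, const float* src);
//
//      void processVectors(float* dst, const float* src)
//      {
//      #if defined(__OGRE_SIMD_ALIGN_STACK)
//          __OGRE_SIMD_ALIGN_STACK();
//      #endif
//          _processVectorsImpl(dst, src);  // callee sees an aligned stack
//      }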


// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include "OgreNoMemoryMacros.h"
#include <xmmintrin.h>
#include "OgreMemoryMacros.h"

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h" has
// already been included.
//
// Note: on some platforms gcc already includes "xmmintrin.h" of its own
// accord. The guard macro _XMMINTRIN_H_INCLUDED picked up here is based on
// the "xmmintrin.h" that comes with cygwin gcc 3.4.4, which should solve
// the duplicate-definition problem on gcc for x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate VC/ICC intrinsics. Only the intrinsics used here are declared.

typedef float __m128 __attribute__ ((mode(V4SF),aligned(16)));
typedef int __m64 __attribute__ ((mode(V2SI)));

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE    FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle instruction must be declared as a macro

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)                            \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define _mm_shuffle_ps(a, b, imm8) __extension__                                        \
    ({                                                                                  \
        __m128 result;                                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8));   \
        result;                                                                         \
    })


// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                               \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)              \
    {                                                                       \
        __m128 result;                                                      \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr));     \
        return result;                                                      \
    }

#define __MM_DECL_LD2(name, instruction, type)                                      \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr)          \
    {                                                                               \
        __m128 result;                                                              \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr));   \
        return result;                                                              \
    }

#define __MM_DECL_ST(name, instruction, type)                               \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)          \
    {                                                                       \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));        \
    }

__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)
__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST

// Two operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)                        \
    {                                                                                   \
        __m128 result;                                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b));    \
        return result;                                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

// Other used instructions

static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
{
    __m128 tmp = _mm_load_ss(addr);
    return _mm_shuffle_ps(tmp, tmp, 0);
}

static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
{
    __m128 result;
    __asm__("xorps %0, %0" : "=x" (result));
    return result;
}

static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
{
    __m128 result;
    __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
    //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
    return result;
}

static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
{
    int result;
    __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
    return result;
}

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

/** Macro __MM_RSQRT_PS calculates the reciprocal square root, and should
    be used for normalising normals only. It can either use a
    Newton-Raphson refined reciprocal square root for higher precision, or
    the SSE rsqrt instruction directly; profile to pick the better one.
@note:
    Prefer never to use the Newton-Raphson reciprocal square root at all,
    since speed tests indicate a 10% performance loss for the unrolled
    version and a 25% loss for the general version (P4 3.0G HT). The
    slight loss in precision is not that important when normalising
    normals.
*/
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif


/** Perform the transpose of a 4x4 matrix of single precision floating
    point values.
    Arguments r0, r1, r2, and r3 are __m128 values whose elements
    form the corresponding rows of a 4x4 matrix.
    The matrix transpose is returned in arguments r0, r1, r2, and
    r3 where r0 now holds column 0 of the original matrix, r1 now
    holds column 1 of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                \
    {                                                       \
        __m128 t3, t2, t1, t0;                              \
                                                            \
        /* r00 r01 r02 r03 */                               \
        /* r10 r11 r12 r13 */                               \
        /* r20 r21 r22 r23 */                               \
        /* r30 r31 r32 r33 */                               \
                                                            \
        t0 = _mm_unpacklo_ps(r0, r1);   /* r00 r10 r01 r11 */  \
        t2 = _mm_unpackhi_ps(r0, r1);   /* r02 r12 r03 r13 */  \
        t1 = _mm_unpacklo_ps(r2, r3);   /* r20 r30 r21 r31 */  \
        t3 = _mm_unpackhi_ps(r2, r3);   /* r22 r32 r23 r33 */  \
                                                            \
        r0 = _mm_movelh_ps(t0, t1);     /* r00 r10 r20 r30 */  \
        r1 = _mm_movehl_ps(t1, t0);     /* r01 r11 r21 r31 */  \
        r2 = _mm_movelh_ps(t2, t3);     /* r02 r12 r22 r32 */  \
        r3 = _mm_movehl_ps(t3, t2);     /* r03 r13 r23 r33 */  \
    }
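
// A minimal usage sketch (illustrative only): transposing a 4x4 matrix
// stored row-major in a hypothetical float array 'mat' of 16 elements.
//
//      __m128 r0 = _mm_loadu_ps(mat +  0);
//      __m128 r1 = _mm_loadu_ps(mat +  4);
//      __m128 r2 = _mm_loadu_ps(mat +  8);
//      __m128 r3 = _mm_loadu_ps(mat + 12);
//      __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // r0..r3 now hold columns
//      _mm_storeu_ps(mat +  0, r0);
//      _mm_storeu_ps(mat +  4, r1);
//      _mm_storeu_ps(mat +  8, r2);
//      _mm_storeu_ps(mat + 12, r3);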

/** Perform the transpose of the contiguously stored rows of a 4x3 matrix
    into a 3x4 matrix of single precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding contiguously stored rows of a 4x3 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, where
    v0 now holds column 0 of the original matrix, v1 now holds column 1
    of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                    \
    {                                                       \
        __m128 t0, t1, t2;                                  \
                                                            \
        /* r00 r01 r02 r10 */                               \
        /* r11 r12 r20 r21 */                               \
        /* r22 r30 r31 r32 */                               \
                                                            \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */   \
        t1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */   \
        t2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */   \
                                                            \
        v0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

/** Perform the transpose of a 3x4 matrix into the contiguously stored
    rows of a 4x3 matrix of single precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding columns of a 3x4 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, as the
    contiguously stored rows of a 4x3 matrix.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                    \
    {                                                       \
        __m128 t0, t1, t2;                                  \
                                                            \
        /* r00 r10 r20 r30 */                               \
        /* r01 r11 r21 r31 */                               \
        /* r02 r12 r22 r32 */                               \
                                                            \
        t0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
        t1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
        t2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
                                                            \
        v0 = _mm_shuffle_ps(t2, t0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }

/** Fill a vector of single precision floating point values with a
    selected element.
    Argument 'fp' is a digit [0-3] that selects the element of argument 'v'.
*/
#define __MM_SELECT(v, fp)                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))
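
// For example (illustrative only; 'v' is a hypothetical __m128 value
// holding { v0, v1, v2, v3 }):
//
//      __m128 s = __MM_SELECT(v, 2);       // s = { v2, v2, v2, v2 }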

/// Accumulate four vectors of single precision floating point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Compute the dot products of two sets of four vectors of single
    precision floating point values.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Compute the dot products between a set of four vectors and a set of
    three vectors of single precision floating point values (the fourth
    component of the second set is implicitly one, so r3 is added as-is).
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/// Accumulate three vectors of single precision floating point values.
#define __MM_ACCUM3_PS(a, b, c)                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Compute the dot products of two sets of three vectors of single
    precision floating point values.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))
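
// A minimal usage sketch (illustrative only): transforming four points at
// once by a 3x3 matrix in SoA form. x, y and z are hypothetical __m128
// values holding the x, y and z components of four points; m00..m22 are
// matrix elements splatted via __MM_SELECT or _mm_load_ps1.
//
//      __m128 rx = __MM_DOT3x3_PS(m00, m01, m02, x, y, z);
//      __m128 ry = __MM_DOT3x3_PS(m10, m11, m12, x, y, z);
//      __m128 rz = __MM_DOT3x3_PS(m20, m21, m22, x, y, z);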

/// Compute the product of two vectors, plus a third vector.
#define __MM_MADD_PS(a, b, c)                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation
#define __MM_LERP_PS(t, a, b)                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Compute the product of two single floating point values, plus a third.
#define __MM_MADD_SS(a, b, c)                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation
#define __MM_LERP_SS(t, a, b)                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
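
// A minimal usage sketch (illustrative only): blending four positions 'a'
// toward four targets 'b' by a scalar weight ('weight', 'a' and 'b' are
// hypothetical values).
//
//      __m128 t = _mm_load_ps1(&weight);           // splat the weight
//      __m128 blended = __MM_LERP_PS(t, a, b);     // a + (b - a) * t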

/// Same as _mm_load_ps, but can help VC generate more optimised code.
#define __MM_LOAD_PS(p)                                             \
    (*(__m128*)(p))

/// Same as _mm_store_ps, but can help VC generate more optimised code.
#define __MM_STORE_PS(p, v)                                         \
    (*(__m128*)(p) = (v))


/** Helper to load/store SSE data based on whether or not the pointer
    is aligned.
*/
template <bool aligned = false>
struct SSEMemoryAccessor
{
    static FORCEINLINE __m128 load(const float *p)
    {
        return _mm_loadu_ps(p);
    }
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        _mm_storeu_ps(p, v);
    }
};
// Special aligned accessor
template <>
struct SSEMemoryAccessor<true>
{
    static FORCEINLINE const __m128& load(const float *p)
    {
        return __MM_LOAD_PS(p);
    }
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        __MM_STORE_PS(p, v);
    }
};

/** Check whether the given pointer is perfectly aligned for SSE.
*/
static FORCEINLINE bool _isAlignedForSSE(const void *p)
{
    return (((size_t)p) & 15) == 0;
}
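
// A minimal usage sketch (illustrative only; the names _scaleVectors,
// 'dst', 'src', 'scale' and 'numVectors' are hypothetical). A template
// worker picks the accessor once, so the per-element loads need no
// runtime alignment test:
//
//      template <bool srcAligned>
//      static void _scaleVectors(float *dst, const float *src,
//                                __m128 scale, size_t numVectors)
//      {
//          typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
//          for (size_t i = 0; i < numVectors; ++i)
//              _mm_storeu_ps(dst + i * 4,
//                  _mm_mul_ps(SrcAccessor::load(src + i * 4), scale));
//      }
//
//      // Dispatch based on a single runtime check:
//      if (_isAlignedForSSE(src))
//          _scaleVectors<true>(dst, src, scale, numVectors);
//      else
//          _scaleVectors<false>(dst, src, scale, numVectors);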

/** Calculate the Newton-Raphson reciprocal square root with the formula:
        0.5 * rsqrt(x) * (3 - x * rsqrt(x)^2)
*/
static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
{
    static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
    static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
    __m128 t = _mm_rsqrt_ps(x);
    return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
        _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
}
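
// A minimal usage sketch (illustrative only): normalising four vectors at
// once in SoA form, where x, y and z are hypothetical __m128 values
// holding the components of four vectors.
//
//      __m128 sqlen = __MM_DOT3x3_PS(x, y, z, x, y, z);  // x*x + y*y + z*z
//      __m128 rlen  = __MM_RSQRT_PS(sqlen);              // 1 / length
//      x = _mm_mul_ps(x, rlen);
//      y = _mm_mul_ps(y, rlen);
//      z = _mm_mul_ps(z, rlen);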

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__