/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2006 Torus Knot Software Ltd
Also see acknowledgements in Readme.html

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
details.

You should have received a copy of the GNU Lesser General Public License along
with this program; if not, write to the Free Software Foundation, Inc., 59
Temple Place - Suite 330, Boston, MA 02111-1307, USA, or go to
http://www.gnu.org/copyleft/lesser.txt.

You may alternatively use this source under the terms of a specific version of
the OGRE Unrestricted License provided you have obtained such a license from
Torus Knot Software Ltd.
-----------------------------------------------------------------------------
*/
#include "OgreStableHeaders.h"

#include "OgreOptimisedUtil.h"
#include "OgrePlatformInformation.h"

#if __OGRE_HAVE_SSE

#include "OgreMatrix4.h"

// Keep this include last, to avoid "xmmintrin.h" being pulled in earlier by
// other headers on some platforms.
#include "OgreSIMDHelper.h"

// Ideally this file would be merged with OgreOptimisedUtil.cpp, but that is
// impossible when compiling with gcc, because SSE instructions can only be
// enabled or disabled at file level.

//-------------------------------------------------------------------------
//
// The routines implemented in this file are performance oriented, which
// means squeezing out every cycle possible. That requirement might break
// some C++/STL rules.
//
//
// Some rules this code tries to respect:
//
// 1. Prefer unpacklo/hi and movelh/hl over shuffle, because they save one
//    byte of binary code :)
// 2. Use add/sub instead of mul where possible.
// 3. Eliminate function-call prologue code.
//
// Beyond that, anything recommended by the Intel Optimization Reference
// Manual.
//
//-------------------------------------------------------------------------

// Use the unrolled SSE version when the vertex count exceeds this limit
#define OGRE_SSE_SKINNING_UNROLL_VERTICES  16

namespace Ogre {

    //-------------------------------------------------------------------------
    // Local classes
    //-------------------------------------------------------------------------

    /** SSE implementation of OptimisedUtil.
    @note
        Don't use this class directly, use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
    {
    protected:
        /// Do we prefer the general SSE version for position/normal shared buffers?
bool mPreferGeneralVersionForSharedBuffers; public: /// Constructor OptimisedUtilSSE(void); /// @copydoc OptimisedUtil::softwareVertexSkinning virtual void softwareVertexSkinning( const float *srcPosPtr, float *destPosPtr, const float *srcNormPtr, float *destNormPtr, const float *blendWeightPtr, const unsigned char* blendIndexPtr, const Matrix4* const* blendMatrices, size_t srcPosStride, size_t destPosStride, size_t srcNormStride, size_t destNormStride, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numVertices); /// @copydoc OptimisedUtil::softwareVertexMorph virtual void softwareVertexMorph( Real t, const float *srcPos1, const float *srcPos2, float *dstPos, size_t numVertices); /// @copydoc OptimisedUtil::concatenateAffineMatrices virtual void concatenateAffineMatrices( const Matrix4& baseMatrix, const Matrix4* srcMatrices, Matrix4* dstMatrices, size_t numMatrices); /// @copydoc OptimisedUtil::calculateFaceNormals virtual void calculateFaceNormals( const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, size_t numTriangles); /// @copydoc OptimisedUtil::calculateLightFacing virtual void calculateLightFacing( const Vector4& lightPos, const Vector4* faceNormals, char* lightFacings, size_t numFaces); /// @copydoc OptimisedUtil::extrudeVertices virtual void extrudeVertices( const Vector4& lightPos, Real extrudeDist, const float* srcPositions, float* destPositions, size_t numVertices); }; #if defined(__OGRE_SIMD_ALIGN_STACK) /** Stack-align implementation of OptimisedUtil. @remarks User code compiled by icc and gcc might not align stack properly, we need ensure stack align to a 16-bytes boundary when execute SSE function. @par We implemeted as align stack following a virtual function call, then should guarantee call instruction are used instead of inline underlying function body here (which might causing problem). @note Don't use this class directly, use OptimisedUtil instead. 
*/ class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil { protected: /// The actual implementation OptimisedUtil* mImpl; public: /// Constructor OptimisedUtilWithStackAlign(OptimisedUtil* impl) : mImpl(impl) { } /// @copydoc OptimisedUtil::softwareVertexSkinning virtual void softwareVertexSkinning( const float *srcPosPtr, float *destPosPtr, const float *srcNormPtr, float *destNormPtr, const float *blendWeightPtr, const unsigned char* blendIndexPtr, const Matrix4* const* blendMatrices, size_t srcPosStride, size_t destPosStride, size_t srcNormStride, size_t destNormStride, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numVertices) { __OGRE_SIMD_ALIGN_STACK(); mImpl->softwareVertexSkinning( srcPosPtr, destPosPtr, srcNormPtr, destNormPtr, blendWeightPtr, blendIndexPtr, blendMatrices, srcPosStride, destPosStride, srcNormStride, destNormStride, blendWeightStride, blendIndexStride, numWeightsPerVertex, numVertices); } /// @copydoc OptimisedUtil::softwareVertexMorph virtual void softwareVertexMorph( Real t, const float *srcPos1, const float *srcPos2, float *dstPos, size_t numVertices) { __OGRE_SIMD_ALIGN_STACK(); mImpl->softwareVertexMorph( t, srcPos1, srcPos2, dstPos, numVertices); } /// @copydoc OptimisedUtil::concatenateAffineMatrices virtual void concatenateAffineMatrices( const Matrix4& baseMatrix, const Matrix4* srcMatrices, Matrix4* dstMatrices, size_t numMatrices) { __OGRE_SIMD_ALIGN_STACK(); mImpl->concatenateAffineMatrices( baseMatrix, srcMatrices, dstMatrices, numMatrices); } /// @copydoc OptimisedUtil::calculateFaceNormals virtual void calculateFaceNormals( const float *positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, size_t numTriangles) { __OGRE_SIMD_ALIGN_STACK(); mImpl->calculateFaceNormals( positions, triangles, faceNormals, numTriangles); } /// @copydoc OptimisedUtil::calculateLightFacing virtual void calculateLightFacing( const Vector4& lightPos, const Vector4* faceNormals, char* lightFacings, size_t numFaces) { __OGRE_SIMD_ALIGN_STACK(); mImpl->calculateLightFacing( lightPos, faceNormals, lightFacings, numFaces); } /// @copydoc OptimisedUtil::extrudeVertices virtual void extrudeVertices( const Vector4& lightPos, Real extrudeDist, const float* srcPositions, float* destPositions, size_t numVertices) { __OGRE_SIMD_ALIGN_STACK(); mImpl->extrudeVertices( lightPos, extrudeDist, srcPositions, destPositions, numVertices); } }; #endif // !defined(__OGRE_SIMD_ALIGN_STACK) //--------------------------------------------------------------------- // Some useful macro for collapse matrices. 
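    //
    // "Collapsing" here means blending the (up to four) bone matrices that
    // influence a vertex into a single affine matrix, weighted by the blend
    // weights: collapsed = w0*M0 + w1*M1 + ..., applied to the top three rows
    // only, since the bottom row of an affine matrix is always (0, 0, 0, 1).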
//--------------------------------------------------------------------- #define __LOAD_MATRIX(row0, row1, row2, pMatrix) \ { \ row0 = __MM_LOAD_PS((*pMatrix)[0]); \ row1 = __MM_LOAD_PS((*pMatrix)[1]); \ row2 = __MM_LOAD_PS((*pMatrix)[2]); \ } #define __LERP_MATRIX(row0, row1, row2, weight, pMatrix) \ { \ row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \ row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \ row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \ } #define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \ { \ row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight); \ row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight); \ row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight); \ } #define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \ { \ row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \ row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \ row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \ } //--------------------------------------------------------------------- // The following macros request variables declared by caller. // // :) Thank row-major matrix used in Ogre, it make we accessing affine matrix easy. //--------------------------------------------------------------------- /** Collapse one-weighted matrix. Eliminated multiply by weight since the weight should be equal to one always */ #define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights) \ { \ pMatrix0 = blendMatrices[pIndices[0]]; \ __LOAD_MATRIX(row0, row1, row2, pMatrix0); \ } /** Collapse two-weighted matrix. Based on the fact that accumulated weights are equal to one, by use lerp, replaced two multiplies and one additive with one multiplie and two additives. */ #define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights) \ { \ weight = _mm_load_ps1(pWeights + 1); \ pMatrix0 = ppMatrices[pIndices[0]]; \ __LOAD_MATRIX(row0, row1, row2, pMatrix0); \ pMatrix1 = ppMatrices[pIndices[1]]; \ __LERP_MATRIX(row0, row1, row2, weight, pMatrix1); \ } /** Collapse three-weighted matrix. */ #define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights) \ { \ weight = _mm_load_ps1(pWeights + 0); \ pMatrix0 = ppMatrices[pIndices[0]]; \ __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \ weight = _mm_load_ps1(pWeights + 1); \ pMatrix1 = ppMatrices[pIndices[1]]; \ __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \ weight = _mm_load_ps1(pWeights + 2); \ pMatrix2 = ppMatrices[pIndices[2]]; \ __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \ } /** Collapse four-weighted matrix. */ #define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights) \ { \ /* Load four blend weights at one time, they will be shuffled later */ \ weights = _mm_loadu_ps(pWeights); \ \ pMatrix0 = ppMatrices[pIndices[0]]; \ weight = __MM_SELECT(weights, 0); \ __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \ pMatrix1 = ppMatrices[pIndices[1]]; \ weight = __MM_SELECT(weights, 1); \ __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \ pMatrix2 = ppMatrices[pIndices[2]]; \ weight = __MM_SELECT(weights, 2); \ __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \ pMatrix3 = ppMatrices[pIndices[3]]; \ weight = __MM_SELECT(weights, 3); \ __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3); \ } //--------------------------------------------------------------------- // Collapse a matrix at one time. 
The collapsed matrix are weighted by // blend-weights, and then can use to transform corresponding vertex directly. // // I'd like use inline function instead of macro here, but I also want to // ensure compiler integrate this code into its callers (release build at // least), doesn't matter about specific compile options. Inline function // work fine for VC, but looks like gcc (3.4.4 here) generate function-call // when implemented as inline function, even if compile with "-O3" option. // #define _collapseOneMatrix( \ m00, m01, m02, \ pBlendWeight, pBlendIndex, \ blendMatrices, \ blendWeightStride, blendIndexStride, \ numWeightsPerVertex) \ { \ /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */ \ /* generate wrong code here!!! */ \ const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \ __m128 weight, weights; \ \ switch (numWeightsPerVertex) \ { \ default: /* Just in case and make compiler happy */ \ case 1: \ __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ break; \ \ case 2: \ __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ break; \ \ case 3: \ __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ break; \ \ case 4: \ __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ break; \ } \ } //--------------------------------------------------------------------- // Collapse four matrices at one time. The collapsed matrix are weighted by // blend-weights, and then can use to transform corresponding vertex directly. // // I'd like use inline function instead of macro here, but I also want to // ensure compiler integrate this code into its callers (release build at // least), doesn't matter about specific compile options. Inline function // work fine for VC, but looks like gcc (3.4.4 here) generate function-call // when implemented as inline function, even if compile with "-O3" option. // #define _collapseFourMatrices( \ m00, m01, m02, \ m10, m11, m12, \ m20, m21, m22, \ m30, m31, m32, \ pBlendWeight, pBlendIndex, \ blendMatrices, \ blendWeightStride, blendIndexStride, \ numWeightsPerVertex) \ { \ /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */ \ /* generate wrong code here!!! 
*/ \ const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \ __m128 weight, weights; \ \ switch (numWeightsPerVertex) \ { \ default: /* Just in case and make compiler happy */ \ case 1: \ __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices, \ rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices, \ rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices, \ rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ break; \ \ case 2: \ __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices, \ rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices, \ rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices, \ rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ break; \ \ case 3: \ __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices, \ rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices, \ rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices, \ rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ break; \ \ case 4: \ __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \ rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices, \ rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices, \ rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices, \ rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ break; \ } \ } //--------------------------------------------------------------------- // General SSE version skinning positions, and optional skinning normals. 
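    //
    // For reference, a rough scalar equivalent of what the SSE routine below
    // computes for a single vertex (illustrative sketch only, not compiled into
    // the build, and the helper name is made up for this comment). The SIMD
    // code additionally special-cases 1, 2, 3 and 4 weights (using a lerp for
    // the two-weight case); the sketch just uses the plain weighted sum.
    //
#if 0
    static void softwareVertexSkinning_ScalarRef_OneVertex(
        const float* pSrcPos, float* pDestPos,
        const float* pSrcNorm, float* pDestNorm,
        const float* pWeights, const unsigned char* pIndices,
        const Matrix4* const* blendMatrices,
        size_t numWeightsPerVertex)
    {
        // Collapse the blend matrices into one weighted matrix (rows 0..2 only)
        float m[3][4] = { 0 };
        for (size_t b = 0; b < numWeightsPerVertex; ++b)
        {
            const Matrix4& mat = *blendMatrices[pIndices[b]];
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 4; ++c)
                    m[r][c] += pWeights[b] * mat[r][c];
        }

        // Transform the position by the collapsed affine matrix (w assumed 1)
        for (int r = 0; r < 3; ++r)
        {
            pDestPos[r] = m[r][0] * pSrcPos[0] + m[r][1] * pSrcPos[1] +
                          m[r][2] * pSrcPos[2] + m[r][3];
        }

        // Transform the normal by the 3x3 part and re-normalise it
        if (pSrcNorm)
        {
            float n[3];
            for (int r = 0; r < 3; ++r)
            {
                n[r] = m[r][0] * pSrcNorm[0] + m[r][1] * pSrcNorm[1] +
                       m[r][2] * pSrcNorm[2];
            }
            float invLen = 1.0f / Math::Sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]);
            for (int r = 0; r < 3; ++r)
                pDestNorm[r] = n[r] * invLen;
        }
    }
#endif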
static void softwareVertexSkinning_SSE_General( const float *pSrcPos, float *pDestPos, const float *pSrcNorm, float *pDestNorm, const float *pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t srcPosStride, size_t destPosStride, size_t srcNormStride, size_t destNormStride, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numVertices) { for (size_t i = 0; i < numVertices; ++i) { // Collapse matrices __m128 m00, m01, m02; _collapseOneMatrix( m00, m01, m02, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex); // Advance blend weight and index pointers advanceRawPointer(pBlendWeight, blendWeightStride); advanceRawPointer(pBlendIndex, blendIndexStride); //------------------------------------------------------------------ // Rearrange to column-major matrix with rows shuffled order to: Z 0 X Y __m128 m03 = _mm_setzero_ps(); __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01); //------------------------------------------------------------------ // Transform position //------------------------------------------------------------------ __m128 s0, s1, s2; // Load source position s0 = _mm_load_ps1(pSrcPos + 0); s1 = _mm_load_ps1(pSrcPos + 1); s2 = _mm_load_ps1(pSrcPos + 2); // Transform by collapsed matrix __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2); // z 0 x y // Store blended position, no aligned requirement _mm_storeh_pi((__m64*)pDestPos, accumPos); _mm_store_ss(pDestPos+2, accumPos); // Advance source and target position pointers advanceRawPointer(pSrcPos, srcPosStride); advanceRawPointer(pDestPos, destPosStride); //------------------------------------------------------------------ // Optional blend normal //------------------------------------------------------------------ if (pSrcNorm) { // Load source normal s0 = _mm_load_ps1(pSrcNorm + 0); s1 = _mm_load_ps1(pSrcNorm + 1); s2 = _mm_load_ps1(pSrcNorm + 2); // Transform by collapsed matrix __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2); // z 0 x y // Normalise normal __m128 tmp = _mm_mul_ps(accumNorm, accumNorm); // z^2 0 x^2 y^2 tmp = __MM_ACCUM3_PS(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)), // x^2 0 y^2 z^2 _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3))); // y^2 0 z^2 x^2 // Note: zero divided here, but neglectable tmp = __MM_RSQRT_PS(tmp); accumNorm = _mm_mul_ps(accumNorm, tmp); // Store blended normal, no aligned requirement _mm_storeh_pi((__m64*)pDestNorm, accumNorm); _mm_store_ss(pDestNorm+2, accumNorm); // Advance source and target normal pointers advanceRawPointer(pSrcNorm, srcNormStride); advanceRawPointer(pDestNorm, destNormStride); } } } //--------------------------------------------------------------------- // Special SSE version skinning shared buffers of position and normal, // and the buffer are packed. 
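    //
    // "Shared and packed" means position and normal are interleaved in a single
    // buffer as px py pz nx ny nz per vertex (24 bytes per vertex), which is why
    // the 16-byte alignment of the source pointer shifts by 8 bytes from one
    // vertex to the next (see the handling in softwareVertexSkinning below).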
template struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed { static void apply( const float* pSrc, float* pDest, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { typedef SSEMemoryAccessor SrcAccessor; typedef SSEMemoryAccessor DestAccessor; // Blending 4 vertices per-iteration for (size_t i = 0; i < numIterations; ++i) { // Collapse matrices __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; _collapseFourMatrices( m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex); // Advance 4 vertices advanceRawPointer(pBlendWeight, 4 * blendWeightStride); advanceRawPointer(pBlendIndex, 4 * blendIndexStride); //------------------------------------------------------------------ // Transform position/normals //------------------------------------------------------------------ __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5; __m128 t0, t1, t2, t3, t4, t5; // Load source position/normals s0 = SrcAccessor::load(pSrc + 0); // px0 py0 pz0 nx0 s1 = SrcAccessor::load(pSrc + 4); // ny0 nz0 px1 py1 s2 = SrcAccessor::load(pSrc + 8); // pz1 nx1 ny1 nz1 s3 = SrcAccessor::load(pSrc + 12); // px2 py2 pz2 nx2 s4 = SrcAccessor::load(pSrc + 16); // ny2 nz2 px3 py3 s5 = SrcAccessor::load(pSrc + 20); // pz3 nx3 ny3 nz3 // Rearrange to component-major for batches calculate. t0 = _mm_unpacklo_ps(s0, s3); // px0 px2 py0 py2 t1 = _mm_unpackhi_ps(s0, s3); // pz0 pz2 nx0 nx2 t2 = _mm_unpacklo_ps(s1, s4); // ny0 ny2 nz0 nz2 t3 = _mm_unpackhi_ps(s1, s4); // px1 px3 py1 py3 t4 = _mm_unpacklo_ps(s2, s5); // pz1 pz3 nx1 nx3 t5 = _mm_unpackhi_ps(s2, s5); // ny1 ny3 nz1 nz3 s0 = _mm_unpacklo_ps(t0, t3); // px0 px1 px2 px3 s1 = _mm_unpackhi_ps(t0, t3); // py0 py1 py2 py3 s2 = _mm_unpacklo_ps(t1, t4); // pz0 pz1 pz2 pz3 s3 = _mm_unpackhi_ps(t1, t4); // nx0 nx1 nx2 nx3 s4 = _mm_unpacklo_ps(t2, t5); // ny0 ny1 ny2 ny3 s5 = _mm_unpackhi_ps(t2, t5); // nz0 nz1 nz2 nz3 // Transform by collapsed matrix // Shuffle row 0 of four collapsed matrices for calculate X component __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); // Transform X components d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // PX0 PX1 PX2 PX3 d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5); // NX0 NX1 NX2 NX3 // Shuffle row 1 of four collapsed matrices for calculate Y component __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); // Transform Y components d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // PY0 PY1 PY2 PY3 d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5); // NY0 NY1 NY2 NY3 // Shuffle row 2 of four collapsed matrices for calculate Z component __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); // Transform Z components d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // PZ0 PZ1 PZ2 PZ3 d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5); // NZ0 NZ1 NZ2 NZ3 // Normalise normals __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5); tmp = __MM_RSQRT_PS(tmp); d3 = _mm_mul_ps(d3, tmp); d4 = _mm_mul_ps(d4, tmp); d5 = _mm_mul_ps(d5, tmp); // Arrange back to continuous format for store results t0 = _mm_unpacklo_ps(d0, d1); // PX0 PY0 PX1 PY1 t1 = _mm_unpackhi_ps(d0, d1); // PX2 PY2 PX3 PY3 t2 = _mm_unpacklo_ps(d2, d3); // PZ0 NX0 PZ1 NX1 t3 = _mm_unpackhi_ps(d2, d3); // PZ2 NX2 PZ3 NX3 t4 = _mm_unpacklo_ps(d4, d5); // NY0 NZ0 NY1 NZ1 t5 = _mm_unpackhi_ps(d4, d5); // NY2 NZ2 NY3 NZ3 d0 = _mm_movelh_ps(t0, t2); 
// PX0 PY0 PZ0 NX0 d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0)); // NY0 NZ0 PX1 PY1 d2 = _mm_movehl_ps(t4, t2); // PZ1 NX1 NY1 NZ1 d3 = _mm_movelh_ps(t1, t3); // PX2 PY2 PZ2 NX2 d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0)); // NY2 NZ2 PX3 PY3 d5 = _mm_movehl_ps(t5, t3); // PZ3 NX3 NY3 NZ3 // Store blended position/normals DestAccessor::store(pDest + 0, d0); DestAccessor::store(pDest + 4, d1); DestAccessor::store(pDest + 8, d2); DestAccessor::store(pDest + 12, d3); DestAccessor::store(pDest + 16, d4); DestAccessor::store(pDest + 20, d5); // Advance 4 vertices pSrc += 4 * (3 + 3); pDest += 4 * (3 + 3); } } }; static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed( const float* pSrcPos, float* pDestPos, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { // pSrcPos might can't align to 16 bytes because 8 bytes alignment shift per-vertex // Instantiating two version only, since other alignement combination not that important. if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos)) { SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed::apply( pSrcPos, pDestPos, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); } else { SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed::apply( pSrcPos, pDestPos, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); } } //--------------------------------------------------------------------- // Special SSE version skinning separated buffers of position and normal, // both of position and normal buffer are packed. template struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed { static void apply( const float* pSrcPos, float* pDestPos, const float* pSrcNorm, float* pDestNorm, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { typedef SSEMemoryAccessor SrcPosAccessor; typedef SSEMemoryAccessor DestPosAccessor; typedef SSEMemoryAccessor SrcNormAccessor; typedef SSEMemoryAccessor DestNormAccessor; // Blending 4 vertices per-iteration for (size_t i = 0; i < numIterations; ++i) { // Collapse matrices __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; _collapseFourMatrices( m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex); // Advance 4 vertices advanceRawPointer(pBlendWeight, 4 * blendWeightStride); advanceRawPointer(pBlendIndex, 4 * blendIndexStride); //------------------------------------------------------------------ // Transform positions //------------------------------------------------------------------ __m128 s0, s1, s2, d0, d1, d2; // Load source positions s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 // Arrange to 3x4 component-major for batches calculate __MM_TRANSPOSE4x3_PS(s0, s1, s2); // Transform by collapsed matrix // Shuffle row 0 of four collapsed matrices for calculate X component __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); // Transform X components d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3 // Shuffle row 1 of four collapsed matrices 
for calculate Y component __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); // Transform Y components d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3 // Shuffle row 2 of four collapsed matrices for calculate Z component __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); // Transform Z components d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3 // Arrange back to 4x3 continuous format for store results __MM_TRANSPOSE3x4_PS(d0, d1, d2); // Store blended positions DestPosAccessor::store(pDestPos + 0, d0); DestPosAccessor::store(pDestPos + 4, d1); DestPosAccessor::store(pDestPos + 8, d2); // Advance 4 vertices pSrcPos += 4 * 3; pDestPos += 4 * 3; //------------------------------------------------------------------ // Transform normals //------------------------------------------------------------------ // Load source normals s0 = SrcNormAccessor::load(pSrcNorm + 0); // x0 y0 z0 x1 s1 = SrcNormAccessor::load(pSrcNorm + 4); // y1 z1 x2 y2 s2 = SrcNormAccessor::load(pSrcNorm + 8); // z2 x3 y3 z3 // Arrange to 3x4 component-major for batches calculate __MM_TRANSPOSE4x3_PS(s0, s1, s2); // Transform by collapsed and shuffled matrices d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2); // X0 X1 X2 X3 d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2); // Y0 Y1 Y2 Y3 d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2); // Z0 Z1 Z2 Z3 // Normalise normals __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2); tmp = __MM_RSQRT_PS(tmp); d0 = _mm_mul_ps(d0, tmp); d1 = _mm_mul_ps(d1, tmp); d2 = _mm_mul_ps(d2, tmp); // Arrange back to 4x3 continuous format for store results __MM_TRANSPOSE3x4_PS(d0, d1, d2); // Store blended normals DestNormAccessor::store(pDestNorm + 0, d0); DestNormAccessor::store(pDestNorm + 4, d1); DestNormAccessor::store(pDestNorm + 8, d2); // Advance 4 vertices pSrcNorm += 4 * 3; pDestNorm += 4 * 3; } } }; static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed( const float* pSrcPos, float* pDestPos, const float* pSrcNorm, float* pDestNorm, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { assert(_isAlignedForSSE(pSrcPos)); // Instantiating two version only, since other alignement combination not that important. if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm)) { SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed::apply( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); } else { SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed::apply( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); } } //--------------------------------------------------------------------- // Special SSE version skinning position only, the position buffer are // packed. 
template struct SoftwareVertexSkinning_SSE_PosOnly_Packed { static void apply( const float* pSrcPos, float* pDestPos, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { typedef SSEMemoryAccessor SrcPosAccessor; typedef SSEMemoryAccessor DestPosAccessor; // Blending 4 vertices per-iteration for (size_t i = 0; i < numIterations; ++i) { // Collapse matrices __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; _collapseFourMatrices( m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex); // Advance 4 vertices advanceRawPointer(pBlendWeight, 4 * blendWeightStride); advanceRawPointer(pBlendIndex, 4 * blendIndexStride); //------------------------------------------------------------------ // Transform positions //------------------------------------------------------------------ __m128 s0, s1, s2, d0, d1, d2; // Load source positions s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 // Arrange to 3x4 component-major for batches calculate __MM_TRANSPOSE4x3_PS(s0, s1, s2); // Transform by collapsed matrix // Shuffle row 0 of four collapsed matrices for calculate X component __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); // Transform X components d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3 // Shuffle row 1 of four collapsed matrices for calculate Y component __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); // Transform Y components d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3 // Shuffle row 2 of four collapsed matrices for calculate Z component __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); // Transform Z components d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3 // Arrange back to 4x3 continuous format for store results __MM_TRANSPOSE3x4_PS(d0, d1, d2); // Store blended positions DestPosAccessor::store(pDestPos + 0, d0); DestPosAccessor::store(pDestPos + 4, d1); DestPosAccessor::store(pDestPos + 8, d2); // Advance 4 vertices pSrcPos += 4 * 3; pDestPos += 4 * 3; } } }; static FORCEINLINE void softwareVertexSkinning_SSE_PosOnly_Packed( const float* pSrcPos, float* pDestPos, const float* pBlendWeight, const unsigned char* pBlendIndex, const Matrix4* const* blendMatrices, size_t blendWeightStride, size_t blendIndexStride, size_t numWeightsPerVertex, size_t numIterations) { assert(_isAlignedForSSE(pSrcPos)); // Instantiating two version only, since other alignement combination not that important. 
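        // (SSEMemoryAccessor comes from OgreSIMDHelper.h; as far as this file is
        // concerned, its boolean template parameter simply selects aligned or
        // unaligned SSE loads/stores at compile time, so each instantiation of
        // the skinning template gets the right variant without a runtime branch.)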
        if (_isAlignedForSSE(pDestPos))
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex, numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex, numIterations);
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    OptimisedUtilSSE::OptimisedUtilSSE(void)
        : mPreferGeneralVersionForSharedBuffers(false)
    {
        // For the AMD Athlon XP (but not the Athlon 64), it is preferable never
        // to use the unrolled version for shared buffers at all, presumably
        // because that version runs out of usable CPU registers, or hits an
        // L1/L2 cache related problem, causing a slight performance loss
        // compared to the general version.
        //
        if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
        {
            // How can we tell an Athlon XP from an Athlon 64?
            // Ok, just test whether SSE2/SSE3 are supported; if not,
            // assume the general version is faster than the unrolled version :)
            //
            if (!(PlatformInformation::getCpuFeatures() &
                (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
            {
                mPreferGeneralVersionForSharedBuffers = true;
            }
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::softwareVertexSkinning(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        // All position/normal pointers should be perfectly aligned, but we still
        // check here in case a hardware buffer allocated by a buggy driver doesn't
        // honour the alignment properly.
        // Because a meta-function technique is used here, the code stays easy to
        // maintain and still covers all possible alignment combinations.
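        //
        // Roughly, the dispatch below works as follows: for large vertex counts,
        // pick the unrolled 4-vertices-per-iteration routine that matches the
        // buffer layout (shared packed position+normal, separate packed buffers,
        // or packed position only), handing any leading unaligned vertices and
        // the final 1-3 leftover vertices to the general routine; for small
        // counts, or for layouts it doesn't recognise, everything goes through
        // the general routine.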
// // Use unrolled routines only if there a lot of vertices if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES) { if (pSrcNorm) { // Blend position and normal if (!mPreferGeneralVersionForSharedBuffers && srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) && pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3) { // Position and normal are sharing with packed buffer size_t srcPosAlign = (size_t)pSrcPos & 15; assert((srcPosAlign & 3) == 0); // Blend unaligned vertices with general SIMD routine if (srcPosAlign == 8) // Because 8 bytes alignment shift per-vertex { size_t count = srcPosAlign / 8; numVertices -= count; softwareVertexSkinning_SSE_General( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, srcPosStride, destPosStride, srcNormStride, destNormStride, blendWeightStride, blendIndexStride, numWeightsPerVertex, count); pSrcPos += count * (3 + 3); pDestPos += count * (3 + 3); pSrcNorm += count * (3 + 3); pDestNorm += count * (3 + 3); advanceRawPointer(pBlendWeight, count * blendWeightStride); advanceRawPointer(pBlendIndex, count * blendIndexStride); } // Blend vertices, four vertices per-iteration size_t numIterations = numVertices / 4; softwareVertexSkinning_SSE_PosNorm_Shared_Packed( pSrcPos, pDestPos, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); // Advance pointers for remaining vertices numVertices &= 3; if (numVertices) { pSrcPos += numIterations * 4 * (3 + 3); pDestPos += numIterations * 4 * (3 + 3); pSrcNorm += numIterations * 4 * (3 + 3); pDestNorm += numIterations * 4 * (3 + 3); advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); } } else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 && srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3) { // Position and normal are separate buffers, and all of them are packed size_t srcPosAlign = (size_t)pSrcPos & 15; assert((srcPosAlign & 3) == 0); // Blend unaligned vertices with general SIMD routine if (srcPosAlign) { size_t count = srcPosAlign / 4; numVertices -= count; softwareVertexSkinning_SSE_General( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, srcPosStride, destPosStride, srcNormStride, destNormStride, blendWeightStride, blendIndexStride, numWeightsPerVertex, count); pSrcPos += count * 3; pDestPos += count * 3; pSrcNorm += count * 3; pDestNorm += count * 3; advanceRawPointer(pBlendWeight, count * blendWeightStride); advanceRawPointer(pBlendIndex, count * blendIndexStride); } // Blend vertices, four vertices per-iteration size_t numIterations = numVertices / 4; softwareVertexSkinning_SSE_PosNorm_Separated_Packed( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); // Advance pointers for remaining vertices numVertices &= 3; if (numVertices) { pSrcPos += numIterations * 4 * 3; pDestPos += numIterations * 4 * 3; pSrcNorm += numIterations * 4 * 3; pDestNorm += numIterations * 4 * 3; advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); } } else // Not 'packed' form or wrong order between position and normal { // Should never occuring, do nothing here just in case } } else // !pSrcNorm { // Blend position only if (srcPosStride == 
sizeof(float) * 3 && destPosStride == sizeof(float) * 3) { // All buffers are packed size_t srcPosAlign = (size_t)pSrcPos & 15; assert((srcPosAlign & 3) == 0); // Blend unaligned vertices with general SIMD routine if (srcPosAlign) { size_t count = srcPosAlign / 4; numVertices -= count; softwareVertexSkinning_SSE_General( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, srcPosStride, destPosStride, srcNormStride, destNormStride, blendWeightStride, blendIndexStride, numWeightsPerVertex, count); pSrcPos += count * 3; pDestPos += count * 3; advanceRawPointer(pBlendWeight, count * blendWeightStride); advanceRawPointer(pBlendIndex, count * blendIndexStride); } // Blend vertices, four vertices per-iteration size_t numIterations = numVertices / 4; softwareVertexSkinning_SSE_PosOnly_Packed( pSrcPos, pDestPos, pBlendWeight, pBlendIndex, blendMatrices, blendWeightStride, blendIndexStride, numWeightsPerVertex, numIterations); // Advance pointers for remaining vertices numVertices &= 3; if (numVertices) { pSrcPos += numIterations * 4 * 3; pDestPos += numIterations * 4 * 3; advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); } } else // Not 'packed' form { // Might occuring only if user forced software blending position only } } } // Blend remaining vertices, need to do it with SIMD for identical result, // since mixing general floating-point and SIMD algorithm will causing // floating-point error. if (numVertices) { softwareVertexSkinning_SSE_General( pSrcPos, pDestPos, pSrcNorm, pDestNorm, pBlendWeight, pBlendIndex, blendMatrices, srcPosStride, destPosStride, srcNormStride, destNormStride, blendWeightStride, blendIndexStride, numWeightsPerVertex, numVertices); } } //--------------------------------------------------------------------- void OptimisedUtilSSE::softwareVertexMorph( Real t, const float *pSrc1, const float *pSrc2, float *pDst, size_t numVertices) { __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); __m128 src01, src02, src11, src12, src21, src22; __m128 dst0, dst1, dst2; __m128 t4 = _mm_load_ps1(&t); size_t numIterations = numVertices / 4; numVertices &= 3; // Never use meta-function technique to accessing memory because looks like // VC7.1 generate a bit inefficient binary code when put following code into // inline function. if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst)) { // All data aligned // Morph 4 vertices per-iteration. Special designed for use all // available CPU registers as possible (7 registers used here), // and avoid temporary values allocated in stack for suppress // extra memory access. 
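            //
            // Each iteration below is simply the lerp dst = src1 + t * (src2 - src1)
            // applied to 12 consecutive floats (4 vertices x 3 components), which
            // is what __MM_LERP_PS expands to for each group of four floats.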
for (size_t i = 0; i < numIterations; ++i) { // 12 floating-point values src01 = __MM_LOAD_PS(pSrc1 + 0); src02 = __MM_LOAD_PS(pSrc2 + 0); src11 = __MM_LOAD_PS(pSrc1 + 4); src12 = __MM_LOAD_PS(pSrc2 + 4); src21 = __MM_LOAD_PS(pSrc1 + 8); src22 = __MM_LOAD_PS(pSrc2 + 8); pSrc1 += 12; pSrc2 += 12; dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); dst2 = __MM_LERP_PS(t4, src21, src22); __MM_STORE_PS(pDst + 0, dst0); __MM_STORE_PS(pDst + 4, dst1); __MM_STORE_PS(pDst + 8, dst2); pDst += 12; } // Morph remaining vertices switch (numVertices) { case 3: // 9 floating-point values src01 = __MM_LOAD_PS(pSrc1 + 0); src02 = __MM_LOAD_PS(pSrc2 + 0); src11 = __MM_LOAD_PS(pSrc1 + 4); src12 = __MM_LOAD_PS(pSrc2 + 4); src21 = _mm_load_ss(pSrc1 + 8); src22 = _mm_load_ss(pSrc2 + 8); dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); dst2 = __MM_LERP_SS(t4, src21, src22); __MM_STORE_PS(pDst + 0, dst0); __MM_STORE_PS(pDst + 4, dst1); _mm_store_ss(pDst + 8, dst2); break; case 2: // 6 floating-point values src01 = __MM_LOAD_PS(pSrc1 + 0); src02 = __MM_LOAD_PS(pSrc2 + 0); src11 = _mm_loadl_pi(t4, (__m64*)(pSrc1 + 4)); // t4 is meaningless here src12 = _mm_loadl_pi(t4, (__m64*)(pSrc2 + 4)); // t4 is meaningless here dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); __MM_STORE_PS(pDst + 0, dst0); _mm_storel_pi((__m64*)(pDst + 4), dst1); break; case 1: // 3 floating-point values src01 = _mm_load_ss(pSrc1 + 2); src02 = _mm_load_ss(pSrc2 + 2); src01 = _mm_loadh_pi(src01, (__m64*)(pSrc1 + 0)); src02 = _mm_loadh_pi(src02, (__m64*)(pSrc2 + 0)); dst0 = __MM_LERP_PS(t4, src01, src02); _mm_storeh_pi((__m64*)(pDst + 0), dst0); _mm_store_ss(pDst + 2, dst0); break; } } else // Should never occuring, just in case buggy driver { // Assume all data unaligned // Morph 4 vertices per-iteration. Special designed for use all // available CPU registers as possible (7 registers used here), // and avoid temporary values allocated in stack for suppress // extra memory access. 
for (size_t i = 0; i < numIterations; ++i) { // 12 floating-point values src01 = _mm_loadu_ps(pSrc1 + 0); src02 = _mm_loadu_ps(pSrc2 + 0); src11 = _mm_loadu_ps(pSrc1 + 4); src12 = _mm_loadu_ps(pSrc2 + 4); src21 = _mm_loadu_ps(pSrc1 + 8); src22 = _mm_loadu_ps(pSrc2 + 8); pSrc1 += 12; pSrc2 += 12; dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); dst2 = __MM_LERP_PS(t4, src21, src22); _mm_storeu_ps(pDst + 0, dst0); _mm_storeu_ps(pDst + 4, dst1); _mm_storeu_ps(pDst + 8, dst2); pDst += 12; } // Morph remaining vertices switch (numVertices) { case 3: // 9 floating-point values src01 = _mm_loadu_ps(pSrc1 + 0); src02 = _mm_loadu_ps(pSrc2 + 0); src11 = _mm_loadu_ps(pSrc1 + 4); src12 = _mm_loadu_ps(pSrc2 + 4); src21 = _mm_load_ss(pSrc1 + 8); src22 = _mm_load_ss(pSrc2 + 8); dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); dst2 = __MM_LERP_SS(t4, src21, src22); _mm_storeu_ps(pDst + 0, dst0); _mm_storeu_ps(pDst + 4, dst1); _mm_store_ss(pDst + 8, dst2); break; case 2: // 6 floating-point values src01 = _mm_loadu_ps(pSrc1 + 0); src02 = _mm_loadu_ps(pSrc2 + 0); src11 = _mm_loadl_pi(t4, (__m64*)(pSrc1 + 4)); // t4 is meaningless here src12 = _mm_loadl_pi(t4, (__m64*)(pSrc2 + 4)); // t4 is meaningless here dst0 = __MM_LERP_PS(t4, src01, src02); dst1 = __MM_LERP_PS(t4, src11, src12); _mm_storeu_ps(pDst + 0, dst0); _mm_storel_pi((__m64*)(pDst + 4), dst1); break; case 1: // 3 floating-point values src01 = _mm_load_ss(pSrc1 + 2); src02 = _mm_load_ss(pSrc2 + 2); src01 = _mm_loadh_pi(src01, (__m64*)(pSrc1 + 0)); src02 = _mm_loadh_pi(src02, (__m64*)(pSrc2 + 0)); dst0 = __MM_LERP_PS(t4, src01, src02); _mm_storeh_pi((__m64*)(pDst + 0), dst0); _mm_store_ss(pDst + 2, dst0); break; } } } //--------------------------------------------------------------------- void OptimisedUtilSSE::concatenateAffineMatrices( const Matrix4& baseMatrix, const Matrix4* pSrcMat, Matrix4* pDstMat, size_t numMatrices) { __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); assert(_isAlignedForSSE(pSrcMat)); assert(_isAlignedForSSE(pDstMat)); // Load base matrix, unaligned __m128 m0 = _mm_loadu_ps(baseMatrix[0]); __m128 m1 = _mm_loadu_ps(baseMatrix[1]); __m128 m2 = _mm_loadu_ps(baseMatrix[2]); __m128 m3 = _mm_loadu_ps(baseMatrix[3]); // m3 should be equal to (0, 0, 0, 1) for (size_t i = 0; i < numMatrices; ++i) { // Load source matrix, aligned __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]); __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]); __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]); ++pSrcMat; __m128 t0, t1, t2, t3; // Concatenate matrix, and store results // Row 0 t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0); t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1); t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2); t3 = _mm_mul_ps(m0, m3); // Compiler should optimise this out of the loop __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3)); // Row 1 t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0); t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1); t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2); t3 = _mm_mul_ps(m1, m3); // Compiler should optimise this out of the loop __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3)); // Row 2 t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0); t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1); t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2); t3 = _mm_mul_ps(m2, m3); // Compiler should optimise this out of the loop __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3)); // Row 3 __MM_STORE_PS((*pDstMat)[3], m3); ++pDstMat; } } //--------------------------------------------------------------------- void OptimisedUtilSSE::calculateFaceNormals( const float 
*positions, const EdgeData::Triangle *triangles, Vector4 *faceNormals, size_t numTriangles) { __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); assert(_isAlignedForSSE(faceNormals)); // Load Vector3 as: (x, 0, y, z) #define __LOAD_VECTOR3(p) _mm_loadh_pi(_mm_load_ss(p), (__m64*)((p)+1)) // Mask used to changes sign of single precision floating point values. OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, }; size_t numIterations = numTriangles / 4; numTriangles &= 3; // Four triangles per-iteration for (size_t i = 0; i < numIterations; ++i) { // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3) #define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3) \ { \ __m128 v0 = __LOAD_VECTOR3(p0); /* x0 -- y0 z0 */ \ __m128 v1 = __LOAD_VECTOR3(p1); /* x1 -- y1 z1 */ \ __m128 v2 = __LOAD_VECTOR3(p2); /* x2 -- y2 z2 */ \ __m128 v3 = __LOAD_VECTOR3(p3); /* x3 -- y3 z3 */ \ __m128 t0, t1; \ \ t0 = _mm_unpacklo_ps(v0, v2); /* x0 x2 -- -- */ \ t1 = _mm_unpacklo_ps(v1, v3); /* x1 x3 -- -- */ \ x = _mm_unpacklo_ps(t0, t1); /* x0 x1 x2 x3 */ \ \ t0 = _mm_unpackhi_ps(v0, v2); /* y0 y2 z0 z2 */ \ t1 = _mm_unpackhi_ps(v1, v3); /* y1 y3 z1 z3 */ \ y = _mm_unpacklo_ps(t0, t1); /* y0 y1 y2 y3 */ \ z = _mm_unpackhi_ps(t0, t1); /* z0 z1 z2 z3 */ \ } __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2; // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x0, y0, z0, positions + triangles[0].vertIndex[0] * 3, positions + triangles[1].vertIndex[0] * 3, positions + triangles[2].vertIndex[0] * 3, positions + triangles[3].vertIndex[0] * 3); // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x1, y1, z1, positions + triangles[0].vertIndex[1] * 3, positions + triangles[1].vertIndex[1] * 3, positions + triangles[2].vertIndex[1] * 3, positions + triangles[3].vertIndex[1] * 3); // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz __LOAD_FOUR_VECTOR3(x2, y2, z2, positions + triangles[0].vertIndex[2] * 3, positions + triangles[1].vertIndex[2] * 3, positions + triangles[2].vertIndex[2] * 3, positions + triangles[3].vertIndex[2] * 3); triangles += 4; // Calculate triangle face normals // a = v1 - v0 __m128 ax = _mm_sub_ps(x1, x0); __m128 ay = _mm_sub_ps(y1, y0); __m128 az = _mm_sub_ps(z1, z0); // b = v2 - v0 __m128 bx = _mm_sub_ps(x2, x0); __m128 by = _mm_sub_ps(y2, y0); __m128 bz = _mm_sub_ps(z2, z0); // n = a cross b __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by)); __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz)); __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx)); // w = - (n dot v0) __m128 nw = _mm_xor_ps( __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0), *(const __m128 *)&msSignMask); // Arrange to per-triangle face normal major format __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw); // Store results __MM_STORE_PS(&faceNormals[0].x, nx); __MM_STORE_PS(&faceNormals[1].x, ny); __MM_STORE_PS(&faceNormals[2].x, nz); __MM_STORE_PS(&faceNormals[3].x, nw); faceNormals += 4; #undef __LOAD_FOUR_VECTOR3 } // Dealing with remaining triangles for (size_t j = 0; j < numTriangles; ++j) { // Load vertices of the triangle __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3); __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3); __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3); ++triangles; // Calculate face normal __m128 t0, t1; __m128 a = _mm_sub_ps(v1, v0); // ax 0 ay az __m128 
b = _mm_sub_ps(v2, v0); // bx 0 by bz t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3)); // az 0 ax ay t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3)); // bz 0 bx by t0 = _mm_mul_ps(t0, b); // az*bx 0 ax*by ay*bz t1 = _mm_mul_ps(t1, a); // ax*bz 0 ay*bx az*by __m128 n = _mm_sub_ps(t0, t1); // ny 0 nz nx __m128 d = _mm_mul_ps( // dy 0 dz dx _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n); n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps( // nx ny nz -(dx+dy+dz) _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)), // nx ny nz 0 _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))), // 0 0 0 dx _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))), // 0 0 0 dy _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1))); // 0 0 0 dz // Store result __MM_STORE_PS(&faceNormals->x, n); ++faceNormals; } #undef __LOAD_VECTOR3 } //--------------------------------------------------------------------- void OptimisedUtilSSE::calculateLightFacing( const Vector4& lightPos, const Vector4* faceNormals, char* lightFacings, size_t numFaces) { __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); assert(_isAlignedForSSE(faceNormals)); // Map to convert 4-bits mask to 4 byte values static const char msMaskMapping[16][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, {0, 0, 1, 0}, {1, 0, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, {0, 0, 0, 1}, {1, 0, 0, 1}, {0, 1, 0, 1}, {1, 1, 0, 1}, {0, 0, 1, 1}, {1, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, }; __m128 n0, n1, n2, n3; __m128 t0, t1; __m128 dp; int bitmask; // Load light vector, unaligned __m128 lp = _mm_loadu_ps(&lightPos.x); // Perload zero to register for compare dot product values __m128 zero = _mm_setzero_ps(); size_t numIterations = numFaces / 4; numFaces &= 3; // Four faces per-iteration for (size_t i = 0; i < numIterations; ++i) { // Load face normals, aligned n0 = __MM_LOAD_PS(&faceNormals[0].x); n1 = __MM_LOAD_PS(&faceNormals[1].x); n2 = __MM_LOAD_PS(&faceNormals[2].x); n3 = __MM_LOAD_PS(&faceNormals[3].x); faceNormals += 4; // Multiply by light vector n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2 n3 = _mm_mul_ps(n3, lp); // x3 y3 z3 w3 // Horizontal add four vector values. t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 t1 = _mm_add_ps( // x2+z2 x3+z3 y2+w2 y3+w3 _mm_unpacklo_ps(n2, n3), // x2 x3 y2 y3 _mm_unpackhi_ps(n2, n3)); // z2 z3 w2 w3 dp = _mm_add_ps( // dp0 dp1 dp2 dp3 _mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x3+z3 _mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y3+w3 // Compare greater than zero and setup 4-bits mask. Use '_mm_cmpnle_ps' // instead of '_mm_cmpgt_ps' here because we want keep 'zero' untouch, // i.e. it's 2nd operand of the assembly instruction. And in fact // '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with operands swapped // in VC7.1. bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); // Convert 4-bits mask to 4 bytes, and store results. 
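            // (_mm_movemask_ps packs the sign bit of each of the four compare
            // results into bits 0-3; for example bitmask == 5 == 0b0101 means
            // faces 0 and 2 face the light, and msMaskMapping[5] is {1, 0, 1, 0},
            // written out with a single 32-bit store below.)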
*reinterpret_cast(lightFacings) = *reinterpret_cast(msMaskMapping[bitmask]); lightFacings += 4; } // Dealing with remaining faces switch (numFaces) { case 3: n0 = __MM_LOAD_PS(&faceNormals[0].x); n1 = __MM_LOAD_PS(&faceNormals[1].x); n2 = __MM_LOAD_PS(&faceNormals[2].x); n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2 t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 t1 = _mm_add_ps( // x2+z2 x2+z2 y2+w2 y2+w2 _mm_unpacklo_ps(n2, n2), // x2 x2 y2 y2 _mm_unpackhi_ps(n2, n2)); // z2 z2 w2 w2 dp = _mm_add_ps( // dp0 dp1 dp2 dp2 _mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x2+z2 _mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y2+w2 bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); lightFacings[0] = msMaskMapping[bitmask][0]; lightFacings[1] = msMaskMapping[bitmask][1]; lightFacings[2] = msMaskMapping[bitmask][2]; break; case 2: n0 = __MM_LOAD_PS(&faceNormals[0].x); n1 = __MM_LOAD_PS(&faceNormals[1].x); n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 dp = _mm_add_ps( // dp0 dp1 dp0 dp1 _mm_movelh_ps(t0, t0), // x0+z0 x1+z1 x0+z0 x1+z1 _mm_movehl_ps(t0, t0)); // y0+w0 y1+w1 y0+w0 y1+w1 bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); lightFacings[0] = msMaskMapping[bitmask][0]; lightFacings[1] = msMaskMapping[bitmask][1]; break; case 1: n0 = __MM_LOAD_PS(&faceNormals[0].x); n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 t0 = _mm_add_ps( // x0+z0 x0+z0 y0+w0 y0+w0 _mm_unpacklo_ps(n0, n0), // x0 x0 y0 y0 _mm_unpackhi_ps(n0, n0)); // z0 z0 w0 w0 dp = _mm_add_ps( // dp0 dp0 dp0 dp0 _mm_movelh_ps(t0, t0), // x0+z0 x0+z0 x0+z0 x0+z0 _mm_movehl_ps(t0, t0)); // y0+w0 y0+w0 y0+w0 y0+w0 bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); lightFacings[0] = msMaskMapping[bitmask][0]; break; } } //--------------------------------------------------------------------- // Template to extrude vertices for directional light. template struct ExtrudeVertices_SSE_DirectionalLight { static void apply( const Vector4& lightPos, Real extrudeDist, const float* pSrcPos, float* pDestPos, size_t numVertices) { typedef SSEMemoryAccessor SrcAccessor; typedef SSEMemoryAccessor DestAccessor; // Directional light, extrusion is along light direction // Load light vector, unaligned __m128 lp = _mm_loadu_ps(&lightPos.x); // Calculate extrusion direction, note that we use inverted direction here // for eliminate an extra negative instruction, we'll compensate for that // by use subtract instruction instead later. 
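            // In other words, with L = lightPos.xyz the code computes
            //     dir = extrudeDist * L / |L|
            // once up front, then produces extruded = src - dir for every vertex,
            // which adds extrudeDist along the extrusion direction without ever
            // spending an instruction on negation.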
__m128 tmp = _mm_mul_ps(lp, lp); tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp)); // Looks like VC7.1 generate a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist)); __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0)); // X Y Z - // Prepare extrude direction for extruding 4 vertices parallelly __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0)); // X Y Z X __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1)); // Y Z X Y __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2)); // Z X Y Z __m128 s0, s1, s2; __m128 d0, d1, d2; size_t numIterations = numVertices / 4; numVertices &= 3; // Extruding 4 vertices per-iteration for (size_t i = 0; i < numIterations; ++i) { s0 = SrcAccessor::load(pSrcPos + 0); s1 = SrcAccessor::load(pSrcPos + 4); s2 = SrcAccessor::load(pSrcPos + 8); pSrcPos += 12; // The extrusion direction is inverted, use subtract instruction here d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 d2 = _mm_sub_ps(s2, dir2); // Z2 X3 Y3 Z3 DestAccessor::store(pDestPos + 0, d0); DestAccessor::store(pDestPos + 4, d1); DestAccessor::store(pDestPos + 8, d2); pDestPos += 12; } // Dealing with remaining vertices switch (numVertices) { case 3: // 9 floating-point values s0 = SrcAccessor::load(pSrcPos + 0); s1 = SrcAccessor::load(pSrcPos + 4); s2 = _mm_load_ss(pSrcPos + 8); // The extrusion direction is inverted, use subtract instruction here d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 d2 = _mm_sub_ss(s2, dir2); // Z2 -- -- -- DestAccessor::store(pDestPos + 0, d0); DestAccessor::store(pDestPos + 4, d1); _mm_store_ss(pDestPos + 8, d2); break; case 2: // 6 floating-point values s0 = SrcAccessor::load(pSrcPos + 0); s1 = _mm_loadl_pi(dir1, (__m64*)(pSrcPos + 4)); // dir1 is meaningless here // The extrusion direction is inverted, use subtract instruction here d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 -- -- DestAccessor::store(pDestPos + 0, d0); _mm_storel_pi((__m64*)(pDestPos + 4), d1); break; case 1: // 3 floating-point values s0 = _mm_loadl_pi(dir0, (__m64*)(pSrcPos + 0)); // dir0 is meaningless here s1 = _mm_load_ss(pSrcPos + 2); // The extrusion direction is inverted, use subtract instruction here d0 = _mm_sub_ps(s0, dir0); // X0 Y0 -- -- d1 = _mm_sub_ss(s1, dir2); // Z0 -- -- -- _mm_storel_pi((__m64*)(pDestPos + 0), d0); _mm_store_ss(pDestPos + 2, d1); break; } } }; //--------------------------------------------------------------------- // Template to extrude vertices for point light. 
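    //
    // For a point light every vertex gets its own extrusion direction, pointing
    // away from the light position: with L = lightPos.xyz,
    //     dst = src + extrudeDist * (src - L) / |src - L|
    // A rough scalar equivalent for one vertex (illustrative sketch only, not
    // compiled into the build; the helper name is made up for this comment):
    //
#if 0
    static void extrudeOneVertexPointLight_ScalarRef(
        const Vector4& lightPos, Real extrudeDist,
        const float* pSrcPos, float* pDestPos)
    {
        float dx = pSrcPos[0] - lightPos.x;
        float dy = pSrcPos[1] - lightPos.y;
        float dz = pSrcPos[2] - lightPos.z;
        float invLen = 1.0f / Math::Sqrt(dx*dx + dy*dy + dz*dz);
        pDestPos[0] = pSrcPos[0] + dx * invLen * extrudeDist;
        pDestPos[1] = pSrcPos[1] + dy * invLen * extrudeDist;
        pDestPos[2] = pSrcPos[2] + dz * invLen * extrudeDist;
    }
#endif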
template struct ExtrudeVertices_SSE_PointLight { static void apply( const Vector4& lightPos, Real extrudeDist, const float* pSrcPos, float* pDestPos, size_t numVertices) { typedef SSEMemoryAccessor SrcAccessor; typedef SSEMemoryAccessor DestAccessor; // Point light, will calculate extrusion direction for every vertex // Load light vector, unaligned __m128 lp = _mm_loadu_ps(&lightPos.x); // Load extrude distance __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist); size_t numIterations = numVertices / 4; numVertices &= 3; // Extruding 4 vertices per-iteration for (size_t i = 0; i < numIterations; ++i) { // Load source positions __m128 s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 __m128 s1 = SrcAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 __m128 s2 = SrcAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 pSrcPos += 12; // Arrange to 3x4 component-major for batches calculate __MM_TRANSPOSE4x3_PS(s0, s1, s2); // Calculate unnormalised extrusion direction __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3 __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3 __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3 // Normalise extrusion direction and multiply by extrude distance __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz); tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4); dx = _mm_mul_ps(dx, tmp); dy = _mm_mul_ps(dy, tmp); dz = _mm_mul_ps(dz, tmp); // Calculate extruded positions __m128 d0 = _mm_add_ps(dx, s0); __m128 d1 = _mm_add_ps(dy, s1); __m128 d2 = _mm_add_ps(dz, s2); // Arrange back to 4x3 continuous format for store results __MM_TRANSPOSE3x4_PS(d0, d1, d2); // Store extruded positions DestAccessor::store(pDestPos + 0, d0); DestAccessor::store(pDestPos + 4, d1); DestAccessor::store(pDestPos + 8, d2); pDestPos += 12; } // Dealing with remaining vertices for (size_t j = 0; j < numVertices; ++j) { // Load source position __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (__m64*)(pSrcPos + 1)); // x 0 y z pSrcPos += 3; // Calculate unnormalised extrusion direction __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z // Normalise extrusion direction and multiply by extrude distance __m128 tmp = _mm_mul_ps(dir, dir); tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3)); // Looks like VC7.1 generate a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4); dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0)); // Calculate extruded position __m128 dst = _mm_add_ps(dir, src); // Store extruded position _mm_store_ss(pDestPos + 0, dst); _mm_storeh_pi((__m64*)(pDestPos + 1), dst); pDestPos += 3; } } }; //--------------------------------------------------------------------- void OptimisedUtilSSE::extrudeVertices( const Vector4& lightPos, Real extrudeDist, const float* pSrcPos, float* pDestPos, size_t numVertices) { __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); // Note: Since pDestPos is following tail of pSrcPos, we can't assume // it's aligned to SIMD alignment properly, so must check for it here. // // TODO: Add extra vertex to the vertex buffer for make sure pDestPos // aligned same as pSrcPos. // // We are use SSE reciprocal square root directly while calculating // extrusion direction, since precision loss not that important here. 
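        // (rsqrtps is only an approximation, with a relative error on the order
        // of 2^-12 according to Intel's documentation, but that is plenty for a
        // shadow volume extrusion direction and avoids a full sqrt plus divide.)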
        //
        if (lightPos.w == 0.0f)
        {
            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
        else
        {
            assert(lightPos.w == 1.0f);

            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    extern OptimisedUtil* _getOptimisedUtilSSE(void)
    {
        static OptimisedUtilSSE msOptimisedUtilSSE;

#if defined(__OGRE_SIMD_ALIGN_STACK)
        static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
        return &msOptimisedUtilWithStackAlign;
#else
        return &msOptimisedUtilSSE;
#endif
    }

}

#endif // __OGRE_HAVE_SSE