SIMD code for mesh normals calculation not working (Trying to convert C to SIMD)-CodePudding

C Code

void Mesh::RecalculateNormals()
{
        for (int i = 0; i < indexCount; i  = 3)
        {
            const int ia = indices[i];
            const int ib = indices[i   1];
            const int ic = indices[i   2];

            const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
            const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
            const glm::vec3 no = cross(e1, e2);

            vert[ia].normal  = glm::vec4(no, 0.0);
            vert[ib].normal  = glm::vec4(no, 0.0);
            vert[ic].normal  = glm::vec4(no, 0.0);
        }

        for (int i = 0; i < vertexCount; i  )
            vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);

}

New Code with SIMD

// From :  https://geometrian.com/programming/tutorials/cross-product/index.php
[[nodiscard]] inline static __m128 cross_product(__m128 const& vec0, __m128 const& vec1) {
    __m128 tmp0 = _mm_shuffle_ps(vec0, vec0, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 tmp1 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 tmp2 = _mm_mul_ps(tmp0, vec1);
    __m128 tmp3 = _mm_mul_ps(tmp0, tmp1);
    __m128 tmp4 = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
    return _mm_sub_ps(tmp3, tmp4);
}

void normalize(const glm::vec4& lpInput, glm::vec4& lpOutput) {
    const __m128& vInput = reinterpret_cast<const __m128&>(lpInput); // load input vector (x, y, z, a)
    __m128 vSquared = _mm_mul_ps(vInput, vInput); // square the input values
    __m128 vHalfSum = _mm_hadd_ps(vSquared, vSquared);
    __m128 vSum = _mm_hadd_ps(vHalfSum, vHalfSum); // compute the sum of values
    float fInvSqrt; _mm_store_ss(&fInvSqrt, _mm_rsqrt_ss(vSum)); // compute the inverse sqrt
    __m128 vNormalized = _mm_mul_ps(vInput, _mm_set1_ps(fInvSqrt)); // normalize the input vector
    lpOutput = reinterpret_cast<const glm::vec4&>(vNormalized); // store normalized vector (x, y, z, a)
}

void Mesh::RecalculateNormals()
{
        float result[4];
        glm::vec4 tmp;
        for (int i = 0; i < indexCount; i  = 3)
        {
            const int ia = indices[i];
            const int ib = indices[i   1];
            const int ic = indices[i   2];

            __m128 iav = _mm_set_ps(vert[ia].position.x, vert[ia].position.y, vert[ia].position.z, 0.0f);
            __m128 ibv = _mm_set_ps(vert[ib].position.x, vert[ib].position.y, vert[ib].position.z, 0.0f);
            __m128 icv = _mm_set_ps(vert[ic].position.x, vert[ic].position.y, vert[ic].position.z, 0.0f);

            __m128 e1i = _mm_sub_ps(iav, ibv);
            __m128 e2i = _mm_sub_ps(icv, ibv);

            //const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
            //const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
            //const glm::vec3 no = cross(e1, e2);

            __m128 no = cross_product(e1i, e2i);

            //vert[ia].normal  = glm::vec4(no, 0.0);
            //vert[ib].normal  = glm::vec4(no, 0.0);
            //vert[ic].normal  = glm::vec4(no, 0.0);
            _mm_storeu_ps(result, no);
            tmp = glm::make_vec4(result);
            vert[ia].normal  = tmp;
            vert[ib].normal  = tmp;
            vert[ic].normal  = tmp;
        }

        for (int i = 0; i < vertexCount; i  )
            vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);

}

But its not working. Please can anyone help finding the problems.

(I am very new to SIMD)

CodePudding user response：

Your correctness problem is probably caused by Intel screwing up the order of their _mm_set_ps and similar intrinsics. To create a vector with x, y, z floats in the first 3 lanes, write either _mm_set_ps( 0, z, y, x ) or _mm_setr_ps( x, y, z, 0 ).

Here’s better primitive functions you gonna need for that. That code assumes you have at least SSE 4.1, that thing is introduced in the Intel Core 2 in 2007, and according to Steam survey the market penetration is 98.89% by now.

// Load an FP32 3D vector from memory
inline __m128 loadFloat3( const glm::vec3& pos )
{
    static_assert( sizeof( glm::vec3 ) == 12, "Expected to be 12 bytes (3 floats)" );

    __m128 low = _mm_castpd_ps( _mm_load_sd( (const double*)&pos ) );
    // Modern compilers are optimizing the following 2 intrinsics into a single insertps with a memory operand
    __m128 high = _mm_load_ss( ( (const float*)&pos )   2 );
    return _mm_insert_ps( low, high, 0x27 );
}

// Store an FP32 3D vector to memory
inline void storeFloat3( glm::vec3& pos, __m128 vec )
{
    _mm_store_sd( (double*)&pos, _mm_castps_pd( vec ) );
    ( (int*)( &pos ) )[ 2 ] = _mm_extract_ps( vec, 2 );
}

// Normalize a 3D vector; if the source is zero, will instead return a zero vector
inline __m128 vector3Normalize( __m128 vec )
{
    // Compute x^2   y^2   z^2, broadcast the result to all 4 lanes
    __m128 dp = _mm_dp_ps( vec, vec, 0b01111111 );
    // res = vec / sqrt( dp )
    __m128 res = _mm_div_ps( vec, _mm_sqrt_ps( dp ) );
    // Compare for dp > 0
    __m128 positiveLength = _mm_cmpgt_ps( dp, _mm_setzero_ps() );
    // Zero out the vector if the dot product was zero
    return _mm_and_ps( res, positiveLength );
}

// Copy-pasted from there: https://github.com/microsoft/DirectXMath/blob/jan2021/Inc/DirectXMathVector.inl#L9506-L9519
// MIT license
#ifdef __AVX__
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
#else
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
#endif

#ifdef __AVX2__
#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
#else
#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
#endif

inline __m128 vector3Cross( __m128 V1, __m128 V2 )
{
    // y1,z1,x1,w1
    __m128 vTemp1 = XM_PERMUTE_PS( V1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // z2,x2,y2,w2
    __m128 vTemp2 = XM_PERMUTE_PS( V2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the left operation
    __m128 vResult = _mm_mul_ps( vTemp1, vTemp2 );
    // z1, x1, y1, w1
    vTemp1 = XM_PERMUTE_PS( vTemp1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // y2, z2, x2, w2
    vTemp2 = XM_PERMUTE_PS( vTemp2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the right operation
    vResult = XM_FNMADD_PS( vTemp1, vTemp2, vResult );
    return vResult;
}

You should not expect high quality results with that algorithm. It computes per-vertex normals by accumulating triangle normals weighted by triangle’s area. This means if your mesh has both small and large triangles, the normals of the small triangles will be ignored, which is suboptimal. A better way is weighting normals by triangle’s angles at vertices. See this article for more information.