I'm especially interested in ways to further optimize a constraint algorithm. So far I've switched to SIMD (SSE4) vector operations, which nearly doubled the performance. I'm hoping there is more performance to squeeze out that I simply don't see.
Here is the algorithm:
FVector4 rest(REST_DISTANCE); // Set the constant simd vector outside the loop
FVector4 stiffness(STIFFNESS);
for (int i = 0; i < TEST_SIZE; i++)
{
const auto& constraint = Constraints[i];
auto& position1 = Particle[constraint.first].Position;
auto& position2 = Particle[constraint.second].Position;
for (int j = 0; j < ITERATION_COUNT; j++)
{
auto delta = position2 - position1;
auto distance = delta.norm();
auto correctionDistance = (distance - rest) / distance;
auto pCorrection = correctionDistance * delta * stiffness;
position1 += pCorrection;
position2 -= pCorrection;
}
}
`Constraints` is an array of a simple struct that holds two unsigned ints. `Particle` is an array of `FParticle` structs, each of which holds three `FVector4` fields. An `FVector4` simply wraps a `__m128` and provides operators and functions like `norm()` that use `_mm_*` intrinsic functions.
Currently the algorithm takes 55 cycles per constraint per iteration; this is with `TEST_SIZE` at 150,000 and `ITERATION_COUNT` at 16. I know there is performance to gain from making it parallel, but I'm hoping to get more performance out of the single-threaded version specifically.
I can post the actual structs if needed.
FVector4 code:
// Thin value wrapper around one SSE __m128 register (lanes: X, Y, Z, W).
// NOTE(review): relies on MSVC-specific pieces (__forceinline, the
// .m128_f32 union member), so this struct is not portable to GCC/Clang
// as written.
struct FVector4
{
// Deliberately leaves Data uninitialized -- avoids a wasted store when
// the vector is overwritten immediately after construction (hot paths).
FVector4()
{}
// Wrap an existing SSE register.
explicit FVector4(const __m128& InitData) : Data(InitData)
{
}
// Rvalue overload; for a trivially copyable __m128 this is identical to
// the const& overload, so it exists only for overload-set symmetry.
explicit FVector4(__m128&& InitData) : Data(InitData)
{
}
// Broadcast a single scalar into all four lanes.
explicit FVector4(float InitValue)
{
Data = _mm_set_ps1(InitValue);
}
// Component-wise construction; W defaults to 0 so 3-D positions can be
// built without spelling out the unused lane.
explicit FVector4(float X, float Y, float Z, float W = 0.f)
{
Data = _mm_setr_ps(X, Y, Z, W);
}
// Per-lane scalar access through MSVC's m128_f32 union view. Writing
// through the non-const overload forces the value through memory, which
// can defeat register allocation -- avoid on performance-critical paths.
__forceinline float& operator[](int index)
{
return Data.m128_f32[index];
}
__forceinline const float& operator[](int index) const
{
return Data.m128_f32[index];
}
// Dot product of the vector with itself over X/Y/Z only. Mask 0x7F:
// the high nibble 0b0111 selects lanes 0-2 for the multiply, the low
// nibble 0xF broadcasts the scalar result into all four output lanes.
// W is deliberately excluded from the sum.
__forceinline __m128 squaredNorm() const
{
return _mm_dp_ps(Data, Data, 0x7F);
}
// Length of the X/Y/Z part, broadcast across all lanes, computed as
// x * rsqrt(x) == sqrt(x) to avoid the longer-latency sqrtps.
// NOTE(review): _mm_rsqrt_ps is an approximation (~12 bits of relative
// precision), and for a zero-length vector this evaluates to
// 0 * +inf = NaN in every lane -- callers must guarantee nonzero length.
__forceinline __m128 norm() const
{
auto sqNorm = squaredNorm();
return _mm_mul_ps(sqNorm, _mm_rsqrt_ps(sqNorm));
}
// Scale by the approximate reciprocal length of the X/Y/Z part (W is
// scaled by the same factor). Same precision/NaN caveats as norm().
__forceinline __m128 normalized() const
{
auto dp = _mm_dp_ps(Data, Data, 0x7F);
dp = _mm_rsqrt_ps(dp);
return _mm_mul_ps(Data, dp);
}
// In-place normalization.
__forceinline void normalize()
{
Data = normalized();
}
__m128 Data;
};
Operators as used in the algorithm:
// Lane-wise subtraction over all four components (W included).
__forceinline FVector4 operator-(const FVector4& a, const FVector4& b)
{
const __m128 difference = _mm_sub_ps(a.Data, b.Data);
return FVector4(difference);
}
// Lane-wise (Hadamard) multiplication over all four components.
__forceinline FVector4 operator*(const FVector4& a, const FVector4& b)
{
const __m128 product = _mm_mul_ps(a.Data, b.Data);
return FVector4(product);
}
// Lane-wise division, computed as a * rcp(b) with one Newton-Raphson
// refinement step: r' = r * (2 - b*r).
// _mm_rcp_ps alone carries only ~12 bits of precision (max relative error
// 1.5 * 2^-12 per Intel's documentation), which silently degrades every
// result flowing through this operator; the single refinement step roughly
// doubles that to ~23 bits -- essentially full float precision -- while
// still avoiding the long-latency divps the original was dodging.
// NOTE: as with any reciprocal path, a zero lane in b yields inf/NaN.
__forceinline FVector4 operator/(const FVector4& a, const FVector4& b)
{
const __m128 r0 = _mm_rcp_ps(b.Data);
const __m128 r1 = _mm_mul_ps(r0, _mm_sub_ps(_mm_set_ps1(2.0f), _mm_mul_ps(b.Data, r0)));
return FVector4(_mm_mul_ps(a.Data, r1));
}
// In-place lane-wise addition; returns the mutated left operand.
__forceinline FVector4& operator+=(FVector4& a, const FVector4& b)
{
const __m128 sum = _mm_add_ps(a.Data, b.Data);
a.Data = sum;
return a;
}
// In-place lane-wise subtraction; returns the mutated left operand.
__forceinline FVector4& operator-=(FVector4& a, const FVector4& b)
{
const __m128 difference = _mm_sub_ps(a.Data, b.Data);
a.Data = difference;
return a;
}