2009年8月5日星期三

Restrict 指針

一般時候我們的編譯器都不知道兩個指針所指的位置是否相同或者有所重疊,大大降低了它的優化效果。就拿 Matrix3 * Vector3 為例:

// Multiplies this matrix (members m00..m22) by the vector v and writes the
// product into result.
// v and result are both references, so they may refer to the same object.
// Each store to result.* may therefore modify v.*, and v.x / v.y / v.z are
// read again in every statement below.
void Matrix::MulVector(const Vec3& v, Vec3& result) {
// Uses m00, m01, m02.
result.x = m00 * v.x + m01 * v.y + m02 * v.z;
// Uses m10, m11, m12; if result aliases v, v.x here is the value stored above.
result.y = m10 * v.x + m11 * v.y + m12 * v.z;
// Uses m20, m21, m22.
result.z = m20 * v.x + m21 * v.y + m22 * v.z;
}

很簡單的三行代碼,然而它隱藏了一個效能上的問題。當 result 被賦予了新的值後,編譯器認為 v 的值也可能被更改了 (v 和 result 都是 reference,屬於 pointer 的一類,可能指向同一個物件)。因此原本可以留在 register 裡的 v.x, v.y 和 v.z 被迫重新由記憶體裡讀取回來。看看 VC2008 編譯成的機器碼吧:

result.x = m00 * v.x + m01 * v.y + m02 * v.z;
mov eax,dword ptr [esp+4]
movss xmm0,dword ptr [ecx+8]
mulss xmm0,dword ptr [eax+8]
movss xmm1,dword ptr [ecx+4]
mulss xmm1,dword ptr [eax+4]
mov edx,dword ptr [esp+8]
addss xmm0,xmm1
movss xmm1,dword ptr [eax]
mulss xmm1,dword ptr [ecx]
addss xmm0,xmm1
movss dword ptr [edx],xmm0

result.y = m10 * v.x + m11 * v.y + m12 * v.z;
movss xmm0,dword ptr [ecx+0Ch]
mulss xmm0,dword ptr [eax]
movss xmm1,dword ptr [ecx+14h]
mulss xmm1,dword ptr [eax+8]
addss xmm0,xmm1
movss xmm1,dword ptr [ecx+10h]
mulss xmm1,dword ptr [eax+4]
addss xmm0,xmm1
movss dword ptr [edx+4],xmm0

result.z = m20 * v.x + m21 * v.y + m22 * v.z;
movss xmm0,dword ptr [ecx+18h]
mulss xmm0,dword ptr [eax]
movss xmm1,dword ptr [ecx+20h]
mulss xmm1,dword ptr [eax+8]
addss xmm0,xmm1
movss xmm1,dword ptr [ecx+1Ch]
mulss xmm1,dword ptr [eax+4]
addss xmm0,xmm1
movss dword ptr [edx+8],xmm0

這時候使用 restrict 就可以幫編譯器一把。請注意,VC2008 的 __restrict 只對指針生效:

// Multiplies this matrix (members m00..m22) by v_ and writes the product
// into result_, going through __restrict-qualified local pointers.
//
// __restrict is a promise to the compiler that the two pointers do not
// alias, so v->x / v->y / v->z need not be reloaded after each store to
// result. The caller must not pass the same object for v_ and result_;
// doing so breaks that promise.
void Matrix::MulVector(const Vec3& v_, Vec3& result_) {
    // __restrict is applied to pointers here, so take the address of each
    // reference parameter first.
    const Vec3* __restrict v = &v_;
    // Named `result` (not `ret`) so the statements below actually compile.
    Vec3* __restrict result = &result_;

    result->x = m00 * v->x + m01 * v->y + m02 * v->z;
    result->y = m10 * v->x + m11 * v->y + m12 * v->z;
    result->z = m20 * v->x + m21 * v->y + m22 * v->z;
}

重新編譯後的機器碼:

result->x = m00 * v->x + m01 * v->y + m02 * v->z;
mov eax,dword ptr [esp+4]
movss xmm1,dword ptr [eax+4]
movss xmm0,dword ptr [eax+8]
movss xmm2,dword ptr [eax]
movss xmm3,dword ptr [ecx+4]
movss xmm4,dword ptr [ecx+8]
mov eax,dword ptr [esp+8]
mulss xmm3,xmm1
mulss xmm4,xmm0
addss xmm3,xmm4
movaps xmm4,xmm2
mulss xmm4,dword ptr [ecx]
addss xmm3,xmm4

result->y = m10 * v->x + m11 * v->y + m12 * v->z;
movss xmm4,dword ptr [ecx+10h]
movss dword ptr [eax],xmm3
movss xmm3,dword ptr [ecx+0Ch]
mulss xmm3,xmm2
mulss xmm4,xmm1
addss xmm3,xmm4
movss xmm4,dword ptr [ecx+14h]
mulss xmm4,xmm0
addss xmm3,xmm4
movss dword ptr [eax+4],xmm3

result->z = m20 * v->x + m21 * v->y + m22 * v->z;
movss xmm3,dword ptr [ecx+18h]
mulss xmm3,xmm2
movss xmm2,dword ptr [ecx+1Ch]
mulss xmm2,xmm1
movss xmm1,dword ptr [ecx+20h]
addss xmm3,xmm2
mulss xmm1,xmm0
addss xmm3,xmm1
movss dword ptr [eax+8],xmm3

可以看到記憶體讀取操作減少了:v.x, v.y 和 v.z 只在開頭各讀取一次,之後一直留在 xmm0–xmm2 裡,浮點數的記憶體讀取由 18 次降到 12 次。

到最後,其實使用局部變量也可達到類似效果:

// Multiplies this matrix (members m00..m22) by v and writes the product
// into result.
// The three components of v are copied into locals before anything is
// stored to result, so later stores to result cannot affect the inputs
// used by the remaining rows.
void Matrix::MulVector(const Vec3& v, Vec3& result) {
    // Snapshot the input vector once, up front.
    const float vx = v.x;
    const float vy = v.y;
    const float vz = v.z;

    // One output component per statement, all fed from the local copies.
    result.x = m00 * vx + m01 * vy + m02 * vz;
    result.y = m10 * vx + m11 * vy + m12 * vz;
    result.z = m20 * vx + m21 * vy + m22 * vz;
}

1 則留言: