Difference between revisions of "Canonical Stereo Code"

From Bo3b's School for Shaderhackers
Jump to: navigation, search
(Matrix Inversion: Spotted an error in the generated asm of my euclidean matrix inverse)
(Prime Directive)
 
(One intermediate revision by one user not shown)
Line 78: Line 78:
 
mul r30.z, r30.x, r30.w
 
mul r30.z, r30.x, r30.w
 
add r0.x, r0.x, r30.z
 
add r0.x, r0.x, r30.z
 +
</syntaxhighlight>
 +
 +
 +
Slightly different variant needed for SM 4 and SM 5 shaders.
 +
 +
<syntaxhighlight lang="c">
 +
...
 +
// 3DMigoto StereoParams in dcl section
 +
dcl_resource_texture2d (float,float,float,float) t125
 +
...
 +
 +
// Load r10 with stereo params
 +
ld_indexable(texture2d)(float,float,float,float) r10.xyzw, l(0, 0, 0, 0), t125.xyzw
 +
 +
// Temp output because we cannot use o0 as a register here
 +
//add o0.xyzw, r0.xyzw, cb0[19].xyzw  example shader output
 +
add r9.xyzw, r0.xyzw, cb0[19].xyzw
 +
 +
// (W - Convergence)
 +
add r10.w, r9.w, -r10.y
 +
 +
// multiply that times Separation for:
 +
//  Separation * (W - Convergence)
 +
mul r10.z, r10.x, r10.w
 +
 +
// Add that to Xold for the complete:
 +
//  Xnew = Xold + Separation * (W - Convergence)
 +
add r9.x, r9.x, r10.z
 +
 +
// Assign temp to output
 +
mov o0.xyzw, r9.xyzw
 +
...
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 186: Line 218:
  
  
''''' ASM (3Dmigoto ASM) '''''
+
''''' ASM (DX9 ASM) '''''
  
 
This one hasn't been fully tested, but should work.  This is created by ''mx-2'' and found on his [https://github.com/mx-2/3d-fix/blob/master/_tools_/inverseMatrix.asm GitHub]
 
This one hasn't been fully tested, but should work.  This is created by ''mx-2'' and found on his [https://github.com/mx-2/3d-fix/blob/master/_tools_/inverseMatrix.asm GitHub]

Latest revision as of 17:31, 24 December 2023

Prime Directive

This is the prime directive code in ASM for DX9 and HelixMod. And the HLSL version for 3Dmigoto. In all cases, we are implementing the NVidia specified formula of:

clipPos.x += EyeSign * Separation * ( clipPos.w – Convergence )


ASM (both HelixMod and 3Dmigoto ASM)

This is the wordy version with extra comments to make it more clear what is happening.

...
// The required constant for txldl in c200.z
def c200, 1.0, 600, 0.0625, 0
// Sampler used to fetch stereo params, 
// s0 sampler is default for VS
dcl_2d s0
...
 
// At this point r0 is the output position, correctly
// placed, but without stereo.
 
// To create stereo effects, we need to calculate:
//  Xnew = Xold + Separation * (W - Convergence)
 
// Fetch the Separation (r30.x) and Convergence (r30.y)  
// using the Helix NVapi trick
texldl r30, c200.z, s0
 
// (W - Convergence)
add r30.w, r0.w, -r30.y
 
// multiply that times Separation for:
//   Separation * (W - Convergence)
mul r30.z, r30.x, r30.w
 
// Add that to Xold for the complete:
//  Xold + Separation * (W - Convergence)
add r0.x, r0.x, r30.z


Another variant that is more concise, but less clear. As you get more accustomed to seeing this sequence of code, or are sharing with an expert crowd, it's less necessary to fully document this part, as it is always the same sequence and easy to recognize because of the texldl of 0.0625.

...
// Stereo correction constants
def c200, 1.0, 600, 0.0625, 0
dcl_2d s0
...
 
// At this point r0 is the output position, correctly
// placed, but without stereo.
 
// To create stereo effects, we need to calculate:
//  Xnew = Xold + Separation * (W - Convergence)
 
texldl r30, c200.z, s0
add r30.w, r0.w, -r30.y
mad r0.x, r30.x, r30.w, r0.x


Another very common variant you'll see in HelixMod fixes is the four line version, with no comments. This is less optimal than the two above, but is worth seeing to be able to recognize it.

...
// Stereo correction constants
def c200, 1.0, 600, 0.0625, 0
dcl_2d s0
...
 
// At this point r0 is the output position, correctly
// placed, but without stereo.
texldl r30, c200.z, s0
add r30.w, r0.w, -r30.y
mul r30.z, r30.x, r30.w
add r0.x, r0.x, r30.z


Slightly different variant needed for SM 4 and SM 5 shaders.

...
// 3DMigoto StereoParams in dcl section
dcl_resource_texture2d (float,float,float,float) t125
...
 
// Load r10 with stereo params
ld_indexable(texture2d)(float,float,float,float) r10.xyzw, l(0, 0, 0, 0), t125.xyzw
 
// Temp output because we cannot use o0 as a register here
//add o0.xyzw, r0.xyzw, cb0[19].xyzw  example shader output
add r9.xyzw, r0.xyzw, cb0[19].xyzw
 
// (W - Convergence)
add r10.w, r9.w, -r10.y
 
// multiply that times Separation for:
//   Separation * (W - Convergence)
mul r10.z, r10.x, r10.w
 
// Add that to Xold for the complete:
//  Xnew = Xold + Separation * (W - Convergence)
add r9.x, r9.x, r10.z
 
// Assign temp to output
mov o0.xyzw, r9.xyzw
...


HLSL (3Dmigoto only)

The HLSL version is very similar, but since it's a compile language there is no need to be terse when writing the code. We can make it "self-documenting" by using good variable names.

...
float separation = StereoParams.Load(0).x;
float convergence = StereoParams.Load(0).y;
 
// At this point, "location" is the output position, but missing stereo.
 
location.x += separation * (location.w - convergence);



Matrix Inversion

We often need to invert a ViewProjectionMatrix, in order to be able to stereo correct a location in the proper viewspace (usually projection space for deferred rendering).

We only need to do this in cases where the inverted matrix is not available. A lot of games have both matrices available, and can be used directly.

We have code samples for both ASM, and for HLSL. It's worth noting that for HelixMod fixes in DX9, that HelixMod already supports inverted matrices directly, and this code should not be used there, because all this extra code will impact performance. Try to avoid using it in PS in particular, because inverting a matrix for every pixel is very costly.


HLSL (3Dmigoto only)

The best way to invert a matrix is to use the runtime shader feature from d3dx.ini. This is best because it runs once per frame, which keeps a performance impact low. This was created by DarkStarSword.

Described here The actual shader is found here And, how to use it, described here


If that doesn't work for some reason (not all games are compatible), it's possible to do it in HLSL directly. This example is backwards in that it starts with an inverted matrix, then creates the forward matrix from that. Used in an example here Again, it's worth noting that doing this in a PS shader is sub-optimal.


This version is found in matrix.hlsl, created by DarkStarSword. It can be added into the header of any HLSL file, or #included.

An original can be found here

matrix inverse(matrix m)
{
	matrix inv;
 
	float det = determinant(m);
	inv[0].x = m[1].y*(m[2].z*m[3].w - m[2].w*m[3].z) + m[1].z*(m[2].w*m[3].y - m[2].y*m[3].w) + m[1].w*(m[2].y*m[3].z - m[2].z*m[3].y);
	inv[0].y = m[0].y*(m[2].w*m[3].z - m[2].z*m[3].w) + m[0].z*(m[2].y*m[3].w - m[2].w*m[3].y) + m[0].w*(m[2].z*m[3].y - m[2].y*m[3].z);
	inv[0].z = m[0].y*(m[1].z*m[3].w - m[1].w*m[3].z) + m[0].z*(m[1].w*m[3].y - m[1].y*m[3].w) + m[0].w*(m[1].y*m[3].z - m[1].z*m[3].y);
	inv[0].w = m[0].y*(m[1].w*m[2].z - m[1].z*m[2].w) + m[0].z*(m[1].y*m[2].w - m[1].w*m[2].y) + m[0].w*(m[1].z*m[2].y - m[1].y*m[2].z);
	inv[1].x = m[1].x*(m[2].w*m[3].z - m[2].z*m[3].w) + m[1].z*(m[2].x*m[3].w - m[2].w*m[3].x) + m[1].w*(m[2].z*m[3].x - m[2].x*m[3].z);
	inv[1].y = m[0].x*(m[2].z*m[3].w - m[2].w*m[3].z) + m[0].z*(m[2].w*m[3].x - m[2].x*m[3].w) + m[0].w*(m[2].x*m[3].z - m[2].z*m[3].x);
	inv[1].z = m[0].x*(m[1].w*m[3].z - m[1].z*m[3].w) + m[0].z*(m[1].x*m[3].w - m[1].w*m[3].x) + m[0].w*(m[1].z*m[3].x - m[1].x*m[3].z);
	inv[1].w = m[0].x*(m[1].z*m[2].w - m[1].w*m[2].z) + m[0].z*(m[1].w*m[2].x - m[1].x*m[2].w) + m[0].w*(m[1].x*m[2].z - m[1].z*m[2].x);
	inv[2].x = m[1].x*(m[2].y*m[3].w - m[2].w*m[3].y) + m[1].y*(m[2].w*m[3].x - m[2].x*m[3].w) + m[1].w*(m[2].x*m[3].y - m[2].y*m[3].x);
	inv[2].y = m[0].x*(m[2].w*m[3].y - m[2].y*m[3].w) + m[0].y*(m[2].x*m[3].w - m[2].w*m[3].x) + m[0].w*(m[2].y*m[3].x - m[2].x*m[3].y);
	inv[2].z = m[0].x*(m[1].y*m[3].w - m[1].w*m[3].y) + m[0].y*(m[1].w*m[3].x - m[1].x*m[3].w) + m[0].w*(m[1].x*m[3].y - m[1].y*m[3].x);
	inv[2].w = m[0].x*(m[1].w*m[2].y - m[1].y*m[2].w) + m[0].y*(m[1].x*m[2].w - m[1].w*m[2].x) + m[0].w*(m[1].y*m[2].x - m[1].x*m[2].y);
	inv[3].x = m[1].x*(m[2].z*m[3].y - m[2].y*m[3].z) + m[1].y*(m[2].x*m[3].z - m[2].z*m[3].x) + m[1].z*(m[2].y*m[3].x - m[2].x*m[3].y);
	inv[3].y = m[0].x*(m[2].y*m[3].z - m[2].z*m[3].y) + m[0].y*(m[2].z*m[3].x - m[2].x*m[3].z) + m[0].z*(m[2].x*m[3].y - m[2].y*m[3].x);
	inv[3].z = m[0].x*(m[1].z*m[3].y - m[1].y*m[3].z) + m[0].y*(m[1].x*m[3].z - m[1].z*m[3].x) + m[0].z*(m[1].y*m[3].x - m[1].x*m[3].y);
	inv[3].w = m[0].x*(m[1].y*m[2].z - m[1].z*m[2].y) + m[0].y*(m[1].z*m[2].x - m[1].x*m[2].z) + m[0].z*(m[1].x*m[2].y - m[1].y*m[2].x);
	inv /= det;
 
	return inv;
}
 
matrix inverse(float4 m0, float4 m1, float4 m2, float4 m3)
{
	return inverse(matrix(m0, m1, m2, m3));
}
 
#define MATRIX(cb, idx) matrix(cb[idx], cb[idx+1], cb[idx+2], cb[idx+3])



The original version of this came from Mike_ar69.

...
matrix ivp, vp;
ivp = matrix(InstanceConsts[1], InstanceConsts[2], InstanceConsts[3], InstanceConsts[4]);
 
// Work out the view-projection matrix from it's inverse:
vp[0].x = ivp[1].y*(ivp[2].z*ivp[3].w - ivp[2].w*ivp[3].z) + ivp[1].z*(ivp[2].w*ivp[3].y - ivp[2].y*ivp[3].w) + ivp[1].w*(ivp[2].y*ivp[3].z - ivp[2].z*ivp[3].y);
vp[0].y = ivp[0].y*(ivp[2].w*ivp[3].z - ivp[2].z*ivp[3].w) + ivp[0].z*(ivp[2].y*ivp[3].w - ivp[2].w*ivp[3].y) + ivp[0].w*(ivp[2].z*ivp[3].y - ivp[2].y*ivp[3].z);
vp[0].z = ivp[0].y*(ivp[1].z*ivp[3].w - ivp[1].w*ivp[3].z) + ivp[0].z*(ivp[1].w*ivp[3].y - ivp[1].y*ivp[3].w) + ivp[0].w*(ivp[1].y*ivp[3].z - ivp[1].z*ivp[3].y);
vp[0].w = ivp[0].y*(ivp[1].w*ivp[2].z - ivp[1].z*ivp[2].w) + ivp[0].z*(ivp[1].y*ivp[2].w - ivp[1].w*ivp[2].y) + ivp[0].w*(ivp[1].z*ivp[2].y - ivp[1].y*ivp[2].z);
vp[1].x = ivp[1].x*(ivp[2].w*ivp[3].z - ivp[2].z*ivp[3].w) + ivp[1].z*(ivp[2].x*ivp[3].w - ivp[2].w*ivp[3].x) + ivp[1].w*(ivp[2].z*ivp[3].x - ivp[2].x*ivp[3].z);
vp[1].y = ivp[0].x*(ivp[2].z*ivp[3].w - ivp[2].w*ivp[3].z) + ivp[0].z*(ivp[2].w*ivp[3].x - ivp[2].x*ivp[3].w) + ivp[0].w*(ivp[2].x*ivp[3].z - ivp[2].z*ivp[3].x);
vp[1].z = ivp[0].x*(ivp[1].w*ivp[3].z - ivp[1].z*ivp[3].w) + ivp[0].z*(ivp[1].x*ivp[3].w - ivp[1].w*ivp[3].x) + ivp[0].w*(ivp[1].z*ivp[3].x - ivp[1].x*ivp[3].z);
vp[1].w = ivp[0].x*(ivp[1].z*ivp[2].w - ivp[1].w*ivp[2].z) + ivp[0].z*(ivp[1].w*ivp[2].x - ivp[1].x*ivp[2].w) + ivp[0].w*(ivp[1].x*ivp[2].z - ivp[1].z*ivp[2].x);
vp[2].x = ivp[1].x*(ivp[2].y*ivp[3].w - ivp[2].w*ivp[3].y) + ivp[1].y*(ivp[2].w*ivp[3].x - ivp[2].x*ivp[3].w) + ivp[1].w*(ivp[2].x*ivp[3].y - ivp[2].y*ivp[3].x);
vp[2].y = ivp[0].x*(ivp[2].w*ivp[3].y - ivp[2].y*ivp[3].w) + ivp[0].y*(ivp[2].x*ivp[3].w - ivp[2].w*ivp[3].x) + ivp[0].w*(ivp[2].y*ivp[3].x - ivp[2].x*ivp[3].y);
vp[2].z = ivp[0].x*(ivp[1].y*ivp[3].w - ivp[1].w*ivp[3].y) + ivp[0].y*(ivp[1].w*ivp[3].x - ivp[1].x*ivp[3].w) + ivp[0].w*(ivp[1].x*ivp[3].y - ivp[1].y*ivp[3].x);
vp[2].w = ivp[0].x*(ivp[1].w*ivp[2].y - ivp[1].y*ivp[2].w) + ivp[0].y*(ivp[1].x*ivp[2].w - ivp[1].w*ivp[2].x) + ivp[0].w*(ivp[1].y*ivp[2].x - ivp[1].x*ivp[2].y);
vp[3].x = ivp[1].x*(ivp[2].z*ivp[3].y - ivp[2].y*ivp[3].z) + ivp[1].y*(ivp[2].x*ivp[3].z - ivp[2].z*ivp[3].x) + ivp[1].z*(ivp[2].y*ivp[3].x - ivp[2].x*ivp[3].y);
vp[3].y = ivp[0].x*(ivp[2].y*ivp[3].z - ivp[2].z*ivp[3].y) + ivp[0].y*(ivp[2].z*ivp[3].x - ivp[2].x*ivp[3].z) + ivp[0].z*(ivp[2].x*ivp[3].y - ivp[2].y*ivp[3].x);
vp[3].z = ivp[0].x*(ivp[1].z*ivp[3].y - ivp[1].y*ivp[3].z) + ivp[0].y*(ivp[1].x*ivp[3].z - ivp[1].z*ivp[3].x) + ivp[0].z*(ivp[1].y*ivp[3].x - ivp[1].x*ivp[3].y);
vp[3].w = ivp[0].x*(ivp[1].y*ivp[2].z - ivp[1].z*ivp[2].y) + ivp[0].y*(ivp[1].z*ivp[2].x - ivp[1].x*ivp[2].z) + ivp[0].z*(ivp[1].x*ivp[2].y - ivp[1].y*ivp[2].x);
vp /= determinant(ivp);
...


ASM (DX9 ASM)

This one hasn't been fully tested, but should work. This is created by mx-2 and found on his GitHub

...
//
// inverseMatrix.asm
//
// Matrix inversion with Gauss-Jordan elimination
// algorithm on GPU.
//
// This algorithm uses 128 instructions, from which
// 83 (best case) to 110 (worst case) are executed.
//
// input matrix is in r0-r3
// output will be in r4-r7
// r8, r9 are used as temporary registers
// c200 = (1,0,0,0) is required
//
// r0.x r0.y r0.z r0.w | r4.x, r4.y, r4.z, r4.w
// r1.x r1.y r1.z r1.w | r5.x, r5.y, r5.z, r5.w
// r2.x r2.y r2.z r2.w | r6.x, r6.y, r6.z, r6.w
// r3.x r3.y r3.z r3.w | r7.x, r7.y, r7.z, r7.w
//
// Test:
// SETR r0, 0, 2, 3, 4
// SETR r1, 1, 0, 5, 4
// SETR r2,-1, 2, 3, 4
// SETR r3, 0, 2, 3, 5
//
// Should produce
// r4 =  1.0,  0.0, -1.0,  0.0
// r5 =  1.6, -0.3, -0.3, -0.8
// r6 =  0.6,  0.2,  0.2, -0.8
// r7 = -1.0,  0.0,  0.0,  1.0
//
 
// Init registers
def c200, 1, 0, 0, 0
mov r4, c200.xyzw
mov r5, c200.wxyz
mov r6, c200.zwxy
mov r7, c200.yzwx
 
// Pivot first column
mov r8, c200
if_eq r0.x, r8.y
  if_eq r1.x, r8.y
    if_eq r2.x, r8.y
      mov r9, r0
      mov r0, r3
      mov r3, r9
      mov r9, r4
      mov r4, r7
      mov r7, r9
    else
      mov r9, r0
      mov r0, r2
      mov r2, r9
      mov r9, r4
      mov r4, r6
      mov r6, r9
    endif
  else
    mov r9, r0
    mov r0, r1
    mov r1, r9
    mov r9, r4
    mov r4, r5
    mov r5, r9
  endif
endif
 
// First column
rcp r8.x, r0.x
mul r8.y, r8.x, r1.x
mul r9, r0, r8.y
add r1, r1, -r9
mul r9, r4, r8.y
add r5, r5, -r9
 
mul r8.y, r8.x, r2.x
mul r9, r0, r8.y
add r2, r2, -r9
mul r9, r4, r8.y
add r6, r6, -r9
 
mul r8.y, r8.x, r3.x
mul r9, r0, r8.y
add r3, r3, -r9
mul r9, r4, r8.y
add r7, r7, -r9
 
// Pivot second column
mov r8, c200
if_eq r1.y, r8.y
  if_eq r2.y, r8.y
    mov r9, r1
    mov r1, r3
    mov r3, r9
    mov r9, r5
    mov r5, r7
    mov r7, r9
  else
    mov r9, r1
    mov r1, r2
    mov r2, r9
    mov r9, r5
    mov r5, r6
    mov r6, r9
  endif
endif
 
// Second column
rcp r8.x, r1.y
mul r8.y, r8.x, r2.y
mul r9, r1, r8.y
add r2, r2, -r9
mul r9, r5, r8.y
add r6, r6, -r9
 
mul r8.y, r8.x, r3.y
mul r9, r1, r8.y
add r3, r3, -r9
mul r9, r5, r8.y
add r7, r7, -r9
 
// Pivot third column
mov r8, c200
if_eq r2.z, r8.y
  mov r9, r2
  mov r2, r3
  mov r3, r9
  mov r9, r6
  mov r6, r7
  mov r7, r9
endif
 
// Third column
rcp r8.x, r2.z
mul r8.y, r8.x, r3.z
mul r9, r2, r8.y
add r3, r3, -r9
mul r9, r6, r8.y
add r7, r7, -r9
 
// Normalize r3.w
rcp r8.x, r3.w
mul r3, r3, r8.x
mul r7, r7, r8.x
 
// Fourth column
mul r8, r3, r2.w
mul r9, r7, r2.w
add r2, r2, -r8
add r6, r6, -r9
 
mul r8, r3, r1.w
mul r9, r7, r1.w
add r1, r1, -r8
add r5, r5, -r9
 
mul r8, r3, r0.w
mul r9, r7, r0.w
add r0, r0, -r8
add r4, r4, -r9
 
// Normalize r2.z
rcp r8.x, r2.z
mul r2, r2, r8.x
mul r6, r6, r8.x
 
// Third column (upper part)
mul r8, r2, r1.z
mul r9, r6, r1.z
add r1, r1, -r8
add r5, r5, -r9
 
mul r8, r2, r0.z
mul r9, r6, r0.z
add r0, r0, -r8
add r4, r4, -r9
 
// Normalize r1.y
rcp r8.x, r1.y
mul r1, r1, r8.x
mul r5, r5, r8.x
 
// Second column (upper part)
mul r8, r1, r0.y
mul r9, r5, r0.y
add r0, r0, -r8
add r4, r4, -r9
 
// Normalize first column
rcp r8.x, r0.x
mul r0, r0, r8.x
mul r4, r4, r8.x
...

DX9 ASM (Euclidean Matrices Only)

A hand-optimised DX9 asm version for column-major "euclidean" matrices that only uses 21 instructions. Note that it cannot be used for matrices that include a projection matrix and assumes that the 4th column is 0,0,0,1 - this is suitable to inverse a view or world-view matrix, but will not work for a projection or view-projection matrix - if you are not sure what kind of matrix you are inversing, do not use this version. Also note that the resulting matrix will be transposed to row-major order, so you may need to switch to mul/mad/mad/add type matrix multiply when you use this rather than dp4/dp4/dp4/dp4.

Source matrix is in r20-r23, destination is in r26-r29. r31.x must be 0. r25 and r30 are used for temporary values and will be clobbered.

    // Do some multiplications & subtractions in parallel with SIMD instructions:
    mul r25.xyz, r20.zxy, r21.yzx // m0.z*m1.y, m0.x*m1.z, m0.y*m1.x
    mad r25.xyz, r20.yzx, r21.zxy, -r25.xyz // m0.y*m1.z - m0.z*m1.y, m0.z*m1.x - m0.x*m1.z, m0.x*m1.y - m0.y*m1.x
    // Now the multiplications:
    mul r25.xyz, r25.xyz, r22.xyz
    // Sum it together to get the determinant:
    add r30.x, r25.x, r25.y
    add r30.x, r30.x, r25.z
 
    // 1st row, simplifying by assuimg the 4th column 0,0,0,1
    // dst0.x = (m1.y*m2.z - m1.z*m2.y)
    // dst0.y = (m1.z*m2.x - m1.x*m2.z)
    // dst0.z = (m1.x*m2.y - m1.y*m2.x)
    // dst0.w = 0
 
    mul r26.xyz, r21.zxy, r22.yzx
    mad r26.xyz, r21.yzx, r22.zxy, -r26.xyz
 
    // 2nd row
    // dst1.x = (col0.z*m2.y - col0.y*m2.z)
    // dst1.y = (col0.x*m2.z - col0.z*m2.x)
    // dst1.z = (col0.y*m2.x - col0.x*m2.y)
    // dst1.w = 0
 
    mul r27.xyz, r20.yzx, r22.zxy
    mad r27.xyz, r20.zxy, r22.yzx, -r27.xyz
 
    // 3nd row
    // dst2.x = (col0.y*m1.z - col0.z*m1.y)
    // dst2.y = (col0.z*m1.x - col0.x*m1.z)
    // dst2.z = (col0.x*m1.y - col0.y*m1.x)
    // dst2.w = 0
 
    mul r28.xyz, r20.zxy, r21.yzx
    mad r28.xyz, r20.yzx, r21.zxy, -r28.xyz
 
    // 4th row
    // dst3.x = - col0.w*dst0.x - col1.w*dst1.x - col2.w*dst2.x
    // dst3.y = - col0.w*dst0.y - col1.w*dst1.y - col2.w*dst2.y
    // dst3.z = - col0.w*dst0.z - col1.w*dst1.z - col2.w*dst2.z
    // dst3.w =   col0.x*dst0.x + col1.x*dst1.x + col2.x*dst2.x (always 1?)
 
    mul r29.xyzw, r20.wwwx, r26.xyzx
    mad r29.xyzw, r21.wwwx, r27.xyzx, r29.xyzw
    mad r29.xyzw, r22.wwwx, r28.xyzx, r29.xyzw
    mov r29.xyz, -r29
 
    // Multiply against 1/determinant (and zero out 4th column):
    rcp r25.x, r30.x
    mov r25.y, r31.x
    mul r26, r26, r25.xxxy
    mul r27, r27, r25.xxxy
    mul r28, r28, r25.xxxy
    mul r29, r29, r25.xxxx
 
    // Note that this matrix has been transposed and is now in ROW major order!

The above code is generated by DarkStarSword's pyasm.py and matrix.py (both of which are available in my 3d-fixes repository), which can be used to generate variants of the above using whichever registers are needed. I never quite got around to making this easy to use, but the above code was created with these commands:

    $ python3
    >>> import pyasm
    >>> import matrix
    >>> pyasm.py_to_asm(matrix._determinant_euclidean_asm_col_major, col0='r20', col1='r21', col2='r22', det='r30', tmp0='r25')
    >>> pyasm.py_to_asm(matrix._inverse_euclidean_asm_col_major, col0='r20', col1='r21', col2='r22', det='r30', dst0='r26', dst1='r27', dst2='r28', dst3='r29', inv_det='r25', std_consts='r31')

Two more variants, both from Helifax. In case those previous ones don't seem quite right.

HLSL

//Work out Inverse
//...Variables
float4 a1, a2, a3, a4;
float4 b1, b2, b3, b4;
float det;
//...Original Matrix
a1 = g_invViewProjMatrix._m00_m10_m20_m30;
a2 = g_invViewProjMatrix._m01_m11_m21_m31;
a3 = g_invViewProjMatrix._m02_m12_m22_m32;
a4 = g_invViewProjMatrix._m03_m13_m23_m33;
//...Determinant
det  = a1.x*(a2.y*(a3.z*a4.w - a3.w*a4.z) + a2.z*(a3.w*a4.y - a3.y*a4.w) + a2.w*(a3.y*a4.z - a3.z*a4.y));
det += a1.y*(a2.x*(a3.w*a4.z - a3.z*a4.w) + a2.z*(a3.x*a4.w - a3.w*a4.z) + a2.w*(a3.z*a4.x - a3.x*a4.z));
det += a1.z*(a2.x*(a3.y*a4.w - a3.w*a4.y) + a2.y*(a3.w*a4.x - a3.x*a4.w) + a2.w*(a3.x*a4.y - a3.y*a4.x));
det += a1.w*(a2.x*(a3.z*a4.y - a3.y*a4.z) + a2.y*(a3.x*a4.z - a3.z*a4.x) + a2.z*(a3.y*a4.x - a3.x*a4.y));
//...Inverse Matrix Elements
b1.x = a2.y*(a3.z*a4.w - a3.w*a4.z) + a2.z*(a3.w*a4.y - a3.y*a4.w) + a2.w*(a3.y*a4.z - a3.z*a4.y);
b1.y = a1.y*(a3.w*a4.z - a3.z*a4.w) + a1.z*(a3.y*a4.w - a3.w*a4.y) + a1.w*(a3.z*a4.y - a3.y*a4.z);
b1.z = a1.y*(a2.z*a4.w - a2.w*a4.z) + a1.z*(a2.w*a4.y - a2.y*a4.w) + a1.w*(a2.y*a4.z - a2.z*a4.y);
b1.w = a1.y*(a2.w*a3.z - a2.z*a3.w) + a1.z*(a2.y*a3.w - a2.w*a3.y) + a1.w*(a2.z*a3.y - a2.y*a3.z);
b2.x = a2.x*(a3.w*a4.z - a3.z*a4.w) + a2.z*(a3.x*a4.w - a3.w*a4.x) + a2.w*(a3.z*a4.x - a3.x*a4.z);
b2.y = a1.x*(a3.z*a4.w - a3.w*a4.z) + a1.z*(a3.w*a4.x - a3.x*a4.w) + a1.w*(a3.x*a4.z - a3.z*a4.x);
b2.z = a1.x*(a2.w*a4.z - a2.z*a4.w) + a1.z*(a2.x*a4.w - a2.w*a4.x) + a1.w*(a2.z*a4.x - a2.x*a4.z);
b2.w = a1.x*(a2.z*a3.w - a2.w*a3.z) + a1.z*(a2.w*a3.x - a2.x*a3.w) + a1.w*(a2.x*a3.z - a2.z*a3.x);
b3.x = a2.x*(a3.y*a4.w - a3.w*a4.y) + a2.y*(a3.w*a4.x - a3.x*a4.w) + a2.w*(a3.x*a4.y - a3.y*a4.x);
b3.y = a1.x*(a3.w*a4.y - a3.y*a4.w) + a1.y*(a3.x*a4.w - a3.w*a4.x) + a1.w*(a3.y*a4.x - a3.x*a4.y);
b3.z = a1.x*(a2.y*a4.w - a2.w*a4.y) + a1.y*(a2.w*a4.x - a2.x*a4.w) + a1.w*(a2.x*a4.y - a2.y*a4.x);
b3.w = a1.x*(a2.w*a3.y - a2.y*a3.w) + a1.y*(a2.x*a3.w - a2.w*a3.x) + a1.w*(a2.y*a3.x - a2.x*a3.y);
b4.x = a2.x*(a3.z*a4.y - a3.y*a4.z) + a2.y*(a3.x*a4.z - a3.z*a4.x) + a2.z*(a3.y*a4.x - a3.x*a4.y);
b4.y = a1.x*(a3.y*a4.z - a3.z*a4.y) + a1.y*(a3.z*a4.x - a3.x*a4.z) + a1.z*(a3.x*a4.y - a3.y*a4.x);
b4.z = a1.x*(a2.z*a4.y - a2.y*a4.z) + a1.y*(a2.x*a4.z - a2.z*a4.x) + a1.z*(a2.y*a4.x - a2.x*a4.y);
b4.w = a1.x*(a2.y*a3.z - a2.z*a3.y) + a1.y*(a2.z*a3.x - a2.x*a3.z) + a1.z*(a2.x*a3.y - a2.y*a3.x);
b1.xyzw /= det;
b2.xyzw /= det;
b3.xyzw /= det;
b4.xyzw /= det;
//End Inverse


ASM

Generated from the HLSL, using fxc.

// Declare how many registers ww use
// The code uses registers from r38 to r53.
dcl_temps 60
 
// 3DMigoto StereoParams:
dcl_resource_texture1d (float,float,float,float) t120
dcl_resource_texture2d (float,float,float,float) t125
ld_indexable(texture1d)(float,float,float,float) r41.xyzw, l(0, 0, 0, 0), t120.xyzw
ld_indexable(texture2d)(float,float,float,float) r40.xyzw, l(0, 0, 0, 0), t125.xyzw
 
// Inverse
// cb0[0], etc is the inverseMatrix
mov r0.xyzw, cb0[0].xyzw
mov r1.xyzw, cb0[1].xyzw
mov r2.xyzw, cb0[2].xyzw
mov r3.xyzw, cb0[3].xyzw
mul r4.x, r2.z, r3.w
mul r4.y, r2.w, r3.z
mov r4.y, -r4.y
add r4.x, r4.y, r4.x
mul r4.x, r1.y, r4.x
mul r4.y, r2.w, r3.y
mul r4.z, r2.y, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.z, r4.y
add r4.x, r4.y, r4.x
mul r4.y, r2.y, r3.z
mul r4.z, r2.z, r3.y
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.w, r4.y
add r4.x, r4.y, r4.x
mul r4.x, r0.x, r4.x
mul r4.y, r2.w, r3.z
mul r4.z, r2.z, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.x, r4.y
mul r4.z, r2.x, r3.w
mul r4.w, r2.w, r3.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.z, r3.x
mul r4.w, r2.x, r3.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.w, r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.y, r4.y
add r4.x, r4.y, r4.x
mul r4.y, r2.y, r3.w
mul r4.z, r2.w, r3.y
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.x, r4.y
mul r4.z, r2.w, r3.x
mul r4.w, r2.x, r3.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.y, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.x, r3.y
mul r4.w, r2.y, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.w, r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.z, r4.y
add r4.x, r4.y, r4.x
mul r4.y, r2.z, r3.y
mul r4.z, r2.y, r3.z
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.x, r4.y
mul r4.z, r2.x, r3.z
mul r4.w, r2.z, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.y, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.y, r3.x
mul r4.w, r2.x, r3.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.z, r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.w, r4.y
add r4.x, r4.y, r4.x
mul r4.y, r2.z, r3.w
mul r4.z, r2.w, r3.z
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.y, r4.y
mul r4.z, r2.w, r3.y
mul r4.w, r2.y, r3.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.y, r3.z
mul r4.w, r2.z, r3.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.w, r4.z
add r5.x, r4.z, r4.y
mul r4.y, r2.w, r3.z
mul r4.z, r2.z, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.y, r4.y
mul r4.z, r2.y, r3.w
mul r4.w, r2.w, r3.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.z, r3.y
mul r4.w, r2.y, r3.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r5.y, r4.z, r4.y
mul r4.y, r1.z, r3.w
mul r4.z, r1.w, r3.z
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.y, r4.y
mul r4.z, r1.w, r3.y
mul r4.w, r1.y, r3.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r1.y, r3.z
mul r4.w, r1.z, r3.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r5.z, r4.z, r4.y
mul r4.y, r1.w, r2.z
mul r4.z, r1.z, r2.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.y, r4.y
mul r4.z, r1.y, r2.w
mul r4.w, r1.w, r2.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r1.z, r2.y
mul r4.w, r1.y, r2.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r5.w, r4.z, r4.y
mul r4.y, r2.w, r3.z
mul r4.z, r2.z, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.x, r4.y
mul r4.z, r2.x, r3.w
mul r4.w, r2.w, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.z, r3.x
mul r4.w, r2.x, r3.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.w, r4.z
add r6.x, r4.z, r4.y
mul r4.y, r2.z, r3.w
mul r4.z, r2.w, r3.z
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.x, r4.y
mul r4.z, r2.w, r3.x
mul r4.w, r2.x, r3.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.x, r3.z
mul r4.w, r2.z, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r6.y, r4.z, r4.y
mul r4.y, r1.w, r3.z
mul r4.z, r1.z, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.x, r4.y
mul r4.z, r1.x, r3.w
mul r4.w, r1.w, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r1.z, r3.x
mul r4.w, r1.x, r3.z
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r6.z, r4.z, r4.y
mul r4.y, r1.z, r2.w
mul r4.z, r1.w, r2.z
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.x, r4.y
mul r4.z, r1.w, r2.x
mul r4.w, r1.x, r2.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.z, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r1.x, r2.z
mul r4.w, r1.z, r2.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r6.w, r4.z, r4.y
mul r4.y, r2.y, r3.w
mul r4.z, r2.w, r3.y
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r1.x, r4.y
mul r4.z, r2.w, r3.x
mul r4.w, r2.x, r3.w
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.y, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.x, r3.y
mul r4.w, r2.y, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r1.w, r4.z
add r7.x, r4.z, r4.y
mul r4.y, r2.w, r3.y
mul r4.z, r2.y, r3.w
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.x, r4.y
mul r4.z, r2.x, r3.w
mul r4.w, r2.w, r3.x
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.y, r4.z
add r4.y, r4.z, r4.y
mul r4.z, r2.y, r3.x
mul r4.w, r2.x, r3.y
mov r4.w, -r4.w
add r4.z, r4.w, r4.z
mul r4.z, r0.w, r4.z
add r7.y, r4.z, r4.y
mul r4.y, r1.y, r3.w
mul r4.z, r1.w, r3.y
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.x, r4.y
mul r4.z, r1.w, r3.x
mul r3.w, r1.x, r3.w
mov r3.w, -r3.w
add r3.w, r3.w, r4.z
mul r3.w, r0.y, r3.w
add r3.w, r3.w, r4.y
mul r4.y, r1.x, r3.y
mul r4.z, r1.y, r3.x
mov r4.z, -r4.z
add r4.y, r4.z, r4.y
mul r4.y, r0.w, r4.y
add r7.z, r3.w, r4.y
mul r3.w, r1.w, r2.y
mul r4.y, r1.y, r2.w
mov r4.y, -r4.y
add r3.w, r3.w, r4.y
mul r3.w, r0.x, r3.w
mul r2.w, r1.x, r2.w
mul r1.w, r1.w, r2.x
mov r1.w, -r1.w
add r1.w, r1.w, r2.w
mul r1.w, r0.y, r1.w
add r1.w, r1.w, r3.w
mul r2.w, r1.y, r2.x
mul r3.w, r1.x, r2.y
mov r3.w, -r3.w
add r2.w, r2.w, r3.w
mul r0.w, r0.w, r2.w
add r7.w, r0.w, r1.w
mul r0.w, r2.z, r3.y
mul r1.w, r2.y, r3.z
mov r1.w, -r1.w
add r0.w, r0.w, r1.w
mul r0.w, r0.w, r1.x
mul r1.w, r2.x, r3.z
mul r2.w, r2.z, r3.x
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r1.w, r1.y
add r0.w, r0.w, r1.w
mul r1.w, r2.y, r3.x
mul r2.w, r2.x, r3.y
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r1.w, r1.z
add r8.x, r0.w, r1.w
mul r0.w, r2.y, r3.z
mul r1.w, r2.z, r3.y
mov r1.w, -r1.w
add r0.w, r0.w, r1.w
mul r0.w, r0.w, r0.x
mul r1.w, r2.z, r3.x
mul r2.w, r2.x, r3.z
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r0.y, r1.w
add r0.w, r0.w, r1.w
mul r1.w, r2.x, r3.y
mul r2.w, r2.y, r3.x
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r0.z, r1.w
add r8.y, r0.w, r1.w
mul r0.w, r1.z, r3.y
mul r1.w, r1.y, r3.z
mov r1.w, -r1.w
add r0.w, r0.w, r1.w
mul r0.w, r0.w, r0.x
mul r1.w, r1.x, r3.z
mul r2.w, r1.z, r3.x
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r0.y, r1.w
add r0.w, r0.w, r1.w
mul r1.w, r1.y, r3.x
mul r2.w, r1.x, r3.y
mov r2.w, -r2.w
add r1.w, r1.w, r2.w
mul r1.w, r0.z, r1.w
add r8.z, r0.w, r1.w
mul r0.w, r1.y, r2.z
mul r1.w, r1.z, r2.y
mov r1.w, -r1.w
add r0.w, r0.w, r1.w
mul r0.x, r0.w, r0.x
mul r0.w, r1.z, r2.x
mul r1.z, r1.x, r2.z
mov r1.z, -r1.z
add r0.w, r0.w, r1.z
mul r0.y, r0.w, r0.y
add r0.x, r0.y, r0.x
mul r0.y, r1.x, r2.y
mul r0.w, r1.y, r2.x
mov r0.w, -r0.w
add r0.y, r0.w, r0.y
mul r0.y, r0.y, r0.z
add r8.w, r0.y, r0.x
div r0.xyzw, r5.xyzw, r4.xxxx
div r1.xyzw, r6.xyzw, r4.xxxx
div r2.xyzw, r7.xyzw, r4.xxxx
div r3.xyzw, r8.xyzw, r4.xxxx
 
// Store results for later use as r0-r4 are most 
// likely to be used by the default shader code.
// r50 equivalent of matrix._m00_m01_m02_m03
// r51 equivalent of matrix._m10_m11_m12_m13
// r52 equivalent of matrix._m20_m21_m22_m23
// r53 equivalent of matrix._m30_m31_m32_m33
 
mov r50.xyzw, r0.xyzw 
mov r51.xyzw, r1.xyzw 
mov r52.xyzw, r2.xyzw 
mov r53.xyzw, r3.xyzw