From d25e2c18e48d2d6af11adbed119e744e17ec2976 Mon Sep 17 00:00:00 2001
From: Lain <lain@obsproject.com>
Date: Mon, 17 Mar 2025 21:54:01 -0700
Subject: [PATCH] libobs/graphics: Optimize certain matrix4 functions

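Optimizes certain 4x4 matrix functions by eliminating unnecessary math
operations. This ends up optimizing the gs_matrix_transform() and
gs_matrix_rotate() functions, making positional/rotational operations
a bit more efficient and requiring much less math than they otherwise
would.

matrix4_mul() is unrolled, the matrix4_translate*_i() functions now
compute only the translation row, and the rotate/scale _i() functions
use a new matrix4_mul_4x3_only() helper that computes only the first
three rows and copies the last row from the right-hand matrix.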
---
 libobs/graphics/matrix4.c | 103 ++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 27 deletions(-)

diff --git a/libobs/graphics/matrix4.c b/libobs/graphics/matrix4.c
index 8168ecbce..33476d26b 100644
--- a/libobs/graphics/matrix4.c
+++ b/libobs/graphics/matrix4.c
@@ -59,20 +59,57 @@ void matrix4_from_axisang(struct matrix4 *dst, const struct axisang *aa)
 
 void matrix4_mul(struct matrix4 *dst, const struct matrix4 *m1, const struct matrix4 *m2)
 {
-	const struct vec4 *m1v = (const struct vec4 *)m1;
-	const float *m2f = (const float *)m2;
-	struct vec4 out[4];
-	int i, j;
+	struct matrix4 transposed;
+	struct matrix4 out;
 
-	for (i = 0; i < 4; i++) {
-		for (j = 0; j < 4; j++) {
-			struct vec4 temp;
-			vec4_set(&temp, m2f[j], m2f[j + 4], m2f[j + 8], m2f[j + 12]);
-			out[i].ptr[j] = vec4_dot(&m1v[i], &temp);
-		}
-	}
+	matrix4_transpose(&transposed, m2);
 
-	matrix4_copy(dst, (struct matrix4 *)out);
+	out.x.x = vec4_dot(&m1->x, &transposed.x);
+	out.x.y = vec4_dot(&m1->x, &transposed.y);
+	out.x.z = vec4_dot(&m1->x, &transposed.z);
+	out.x.w = vec4_dot(&m1->x, &transposed.t);
+	out.y.x = vec4_dot(&m1->y, &transposed.x);
+	out.y.y = vec4_dot(&m1->y, &transposed.y);
+	out.y.z = vec4_dot(&m1->y, &transposed.z);
+	out.y.w = vec4_dot(&m1->y, &transposed.t);
+	out.z.x = vec4_dot(&m1->z, &transposed.x);
+	out.z.y = vec4_dot(&m1->z, &transposed.y);
+	out.z.z = vec4_dot(&m1->z, &transposed.z);
+	out.z.w = vec4_dot(&m1->z, &transposed.t);
+	out.t.x = vec4_dot(&m1->t, &transposed.x);
+	out.t.y = vec4_dot(&m1->t, &transposed.y);
+	out.t.z = vec4_dot(&m1->t, &transposed.z);
+	out.t.w = vec4_dot(&m1->t, &transposed.t);
+
+	matrix4_copy(dst, &out);
+}
+
+void matrix4_mul_4x3_only(struct matrix4 *dst, const struct matrix4 *m1, const struct matrix4 *m2)
+{
+	struct matrix4 transposed;
+	struct vec4 x;
+	struct vec4 y;
+	struct vec4 z;
+
+	matrix4_transpose(&transposed, m2);
+
+	x.x = vec4_dot(&m1->x, &transposed.x);
+	x.y = vec4_dot(&m1->x, &transposed.y);
+	x.z = vec4_dot(&m1->x, &transposed.z);
+	x.w = vec4_dot(&m1->x, &transposed.t);
+	y.x = vec4_dot(&m1->y, &transposed.x);
+	y.y = vec4_dot(&m1->y, &transposed.y);
+	y.z = vec4_dot(&m1->y, &transposed.z);
+	y.w = vec4_dot(&m1->y, &transposed.t);
+	z.x = vec4_dot(&m1->z, &transposed.x);
+	z.y = vec4_dot(&m1->z, &transposed.y);
+	z.z = vec4_dot(&m1->z, &transposed.z);
+	z.w = vec4_dot(&m1->z, &transposed.t);
+
+	vec4_copy(&dst->x, &x);
+	vec4_copy(&dst->y, &y);
+	vec4_copy(&dst->z, &z);
+	vec4_copy(&dst->t, &m2->t);
 }
 
 static inline void get_3x3_submatrix(float *dst, const struct matrix4 *m, int i, int j)
@@ -172,38 +209,50 @@ void matrix4_scale(struct matrix4 *dst, const struct matrix4 *m, const struct ve
 
 void matrix4_translate3v_i(struct matrix4 *dst, const struct vec3 *v, const struct matrix4 *m)
 {
-	struct matrix4 temp;
-	vec4_set(&temp.x, 1.0f, 0.0f, 0.0f, 0.0f);
-	vec4_set(&temp.y, 0.0f, 1.0f, 0.0f, 0.0f);
-	vec4_set(&temp.z, 0.0f, 0.0f, 1.0f, 0.0f);
-	vec4_from_vec3(&temp.t, v);
+	struct matrix4 transposed;
+	struct vec4 v4;
+	struct vec4 t;
 
-	matrix4_mul(dst, &temp, m);
+	vec4_from_vec3(&v4, v);
+	matrix4_transpose(&transposed, m);
+	t.x = vec4_dot(&v4, &transposed.x);
+	t.y = vec4_dot(&v4, &transposed.y);
+	t.z = vec4_dot(&v4, &transposed.z);
+	t.w = vec4_dot(&v4, &transposed.t);
+	vec4_copy(&dst->x, &m->x);
+	vec4_copy(&dst->y, &m->y);
+	vec4_copy(&dst->z, &m->z);
+	vec4_copy(&dst->t, &t);
 }
 
 void matrix4_translate4v_i(struct matrix4 *dst, const struct vec4 *v, const struct matrix4 *m)
 {
-	struct matrix4 temp;
-	vec4_set(&temp.x, 1.0f, 0.0f, 0.0f, 0.0f);
-	vec4_set(&temp.y, 0.0f, 1.0f, 0.0f, 0.0f);
-	vec4_set(&temp.z, 0.0f, 0.0f, 1.0f, 0.0f);
-	vec4_copy(&temp.t, v);
+	struct matrix4 transposed;
+	struct vec4 t;
 
-	matrix4_mul(dst, &temp, m);
+	matrix4_transpose(&transposed, m);
+	t.x = vec4_dot(v, &transposed.x);
+	t.y = vec4_dot(v, &transposed.y);
+	t.z = vec4_dot(v, &transposed.z);
+	t.w = vec4_dot(v, &transposed.t);
+	vec4_copy(&dst->x, &m->x);
+	vec4_copy(&dst->y, &m->y);
+	vec4_copy(&dst->z, &m->z);
+	vec4_copy(&dst->t, &t);
 }
 
 void matrix4_rotate_i(struct matrix4 *dst, const struct quat *q, const struct matrix4 *m)
 {
 	struct matrix4 temp;
 	matrix4_from_quat(&temp, q);
-	matrix4_mul(dst, &temp, m);
+	matrix4_mul_4x3_only(dst, &temp, m);
 }
 
 void matrix4_rotate_aa_i(struct matrix4 *dst, const struct axisang *aa, const struct matrix4 *m)
 {
 	struct matrix4 temp;
 	matrix4_from_axisang(&temp, aa);
-	matrix4_mul(dst, &temp, m);
+	matrix4_mul_4x3_only(dst, &temp, m);
 }
 
 void matrix4_scale_i(struct matrix4 *dst, const struct vec3 *v, const struct matrix4 *m)
@@ -213,7 +262,7 @@ void matrix4_scale_i(struct matrix4 *dst, const struct vec3 *v, const struct mat
 	vec4_set(&temp.y, 0.0f, v->y, 0.0f, 0.0f);
 	vec4_set(&temp.z, 0.0f, 0.0f, v->z, 0.0f);
 	vec4_set(&temp.t, 0.0f, 0.0f, 0.0f, 1.0f);
-	matrix4_mul(dst, &temp, m);
+	matrix4_mul_4x3_only(dst, &temp, m);
 }
 
 bool matrix4_inv(struct matrix4 *dst, const struct matrix4 *m)