Update astcenc to the upstream 5.3.0 release

This is mostly a maintenance update that brings the compressor inline with the recently published Khronos Data Format Specification 1.4 release which clarified some ambiguity in the specification. This update also gives minor codec optimizations, bug fixes, and image quality improvements. The biggest improvement for Godot is that builds using MSVC cl.exe will now correctly default to the SSE2-optimized backend rather than the reference C backend. This makes compression more than 3 times faster. Builds using other compilers (GCC, LLVM/Clang) were not impacted by the underlying issue, and see no performance uplift.
2025-03-21 16:02:50 -07:00
parent 2303ce843a
commit 75ce42d463
24 changed files with 2068 additions and 813 deletions
--- a/thirdparty/astcenc/astcenc_weight_align.cpp
+++ b/thirdparty/astcenc/astcenc_weight_align.cpp
@ -43,6 +43,7 @@
 #include <stdio.h>
 #include <cassert>
 #include <cstring>
+#include <cfloat>

 static constexpr unsigned int ANGULAR_STEPS { 32 };

@ -104,14 +105,17 @@ static void compute_angular_offsets(
 	// Precompute isample; arrays are always allocated 64 elements long
 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
 	{
-		// Add 2^23 and interpreting bits extracts round-to-nearest int
-		vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
-		vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
+
+		// Convert a weight to a sincos table index
+		vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
+		vint isample = float_to_int_rtn(sample);
 		storea(isample, isamplev + i);
 	}

 	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
-	vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
+	vfloat mult(1.0f / (2.0f * astc::PI));

 	for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
 	{
@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);

-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
+
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);

 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
 	{
-		vfloat minidx(128.0f);
-		vfloat maxidx(-128.0f);
 		vfloat errval = vfloat::zero();
 		vfloat cut_low_weight_err = vfloat::zero();
 		vfloat cut_high_weight_err = vfloat::zero();
 		vfloat offset = loada(offsets + sp);

+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
 		for (unsigned int j = 0; j < weight_count; j++)
 		{
 			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
 			vfloat diff = sval - svalrte;
 			errval += diff * diff;

-			// Reset tracker on min hit
-			vmask mask = svalrte < minidx;
-			minidx = select(minidx, svalrte, mask);
-			cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on min hit
-			mask = svalrte == minidx;
+			// Accumulate errors for minimum index
+			vmask mask = svalrte == minidx;
 			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
 			cut_low_weight_err = select(cut_low_weight_err, accum, mask);

-			// Reset tracker on max hit
-			mask = svalrte > maxidx;
-			maxidx = select(maxidx, svalrte, mask);
-			cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
-
-			// Accumulate on max hit
+			// Accumulate errors for maximum index
 			mask = svalrte == maxidx;
 			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
 			cut_high_weight_err = select(cut_high_weight_err, accum, mask);