Update astcenc to the upstream 5.3.0 release

This is mostly a maintenance update that brings the compressor inline
with the recently published Khronos Data Format Specification 1.4
release which clarified some ambiguity in the specification. This update
also gives minor codec optimizations, bug fixes, and image quality
improvements.

The biggest improvement for Godot is that builds using MSVC cl.exe will
now correctly default to the SSE2-optimized backend rather than the
reference C backend. This makes compression more than 3 times faster.
Builds using other compilers (GCC, LLVM/Clang) were not impacted by the
underlying issue, and see no performance uplift.
This commit is contained in:
Peter Harris
2025-03-21 16:02:50 -07:00
parent 2303ce843a
commit 75ce42d463
24 changed files with 2068 additions and 813 deletions

View File

@ -43,6 +43,7 @@
#include <stdio.h>
#include <cassert>
#include <cstring>
#include <cfloat>
static constexpr unsigned int ANGULAR_STEPS { 32 };
@ -104,14 +105,17 @@ static void compute_angular_offsets(
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
// Add 2^23 and interpreting bits extracts round-to-nearest int
vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f);
vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1));
// Ideal weight can be outside [0, 1] range, so clamp to fit table
vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
// Convert a weight to a sincos table index
vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
vint isample = float_to_int_rtn(sample);
storea(isample, isamplev + i);
}
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
vfloat mult = vfloat(1.0f / (2.0f * astc::PI));
vfloat mult(1.0f / (2.0f * astc::PI));
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
{
@ -164,18 +168,41 @@ static void compute_lowest_and_highest_weight(
promise(weight_count > 0);
promise(max_angular_steps > 0);
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
// Compute minimum/maximum weights in the weight array. Our remapping
// is monotonic, so the min/max rounded weights relate to the min/max
// unrounded weights in a straightforward way.
vfloat min_weight(FLT_MAX);
vfloat max_weight(-FLT_MAX);
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
vmask active = lane_id < vint(weight_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vfloat weights = loada(dec_weight_ideal_value + i);
min_weight = min(min_weight, select(min_weight, weights, active));
max_weight = max(max_weight, select(max_weight, weights, active));
}
min_weight = hmin(min_weight);
max_weight = hmax(max_weight);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
vfloat minidx(128.0f);
vfloat maxidx(-128.0f);
vfloat errval = vfloat::zero();
vfloat cut_low_weight_err = vfloat::zero();
vfloat cut_high_weight_err = vfloat::zero();
vfloat offset = loada(offsets + sp);
// We know the min and max weight values, so we can figure out
// the corresponding indices before we enter the loop.
vfloat minidx = round(min_weight * rcp_stepsize - offset);
vfloat maxidx = round(max_weight * rcp_stepsize - offset);
for (unsigned int j = 0; j < weight_count; j++)
{
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
@ -183,22 +210,12 @@ static void compute_lowest_and_highest_weight(
vfloat diff = sval - svalrte;
errval += diff * diff;
// Reset tracker on min hit
vmask mask = svalrte < minidx;
minidx = select(minidx, svalrte, mask);
cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask);
// Accumulate on min hit
mask = svalrte == minidx;
// Accumulate errors for minimum index
vmask mask = svalrte == minidx;
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
// Reset tracker on max hit
mask = svalrte > maxidx;
maxidx = select(maxidx, svalrte, mask);
cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask);
// Accumulate on max hit
// Accumulate errors for maximum index
mask = svalrte == maxidx;
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
cut_high_weight_err = select(cut_high_weight_err, accum, mask);