From 457c74f7be972d84a4f13b53c3ad75ff71c9a039 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Sun, 4 Sep 2022 18:45:33 +0200 Subject: [PATCH 1/4] BC4/5 fixes and performance improvements This fixes #17 but goes further, since it provides higher accuracy for other blocks with few values. Two value blocks are special-cased to use the two endpoints. An early out is taken when the error reaches zero. --- rgbcx.cpp | 34 ++++++++++++++++++++++++++++------ rgbcx.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/rgbcx.cpp b/rgbcx.cpp index 609a4d4..063bbbb 100644 --- a/rgbcx.cpp +++ b/rgbcx.cpp @@ -2792,6 +2792,19 @@ namespace rgbcx return 0; } + // if we only have two values (min and max) the search radius can be set to zero (setting the endpoints directly) + bool has_two_values = true; + for (uint32_t i = 0; i < 16; i++) { + uint32_t val = pPixels[i * stride]; + if (val != min_val && val != max_val) { + has_two_values = false; + break; + } + } + if (has_two_values) { + search_rad = 0; + } + uint32_t best_err = UINT32_MAX; for (uint32_t mode = 0; mode < 2; mode++) { @@ -2817,8 +2830,9 @@ namespace rgbcx else if (!trial_block.is_alpha6_block()) std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]); - uint8_t block_vals[8]; - trial_block.get_block_values(block_vals, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + // note: block vals are expanded to 16-bit, as is the error + uint16_t block_vals16[8]; + trial_block.get_block_values(block_vals16, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); uint32_t trial_err = 0; uint8_t trial_sels[16]; @@ -2827,8 +2841,10 @@ namespace rgbcx { memcpy(trial_sels, pForce_selectors, 16); - for (uint32_t i = 0; i < 16; i++) - trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]); + for (uint32_t i = 0; i < 16; i++) { + int val = pPixels[i * stride]; + trial_err += squarei(block_vals16[pForce_selectors[i]] - ((val << 8) | 
val)); + } } else { @@ -2838,7 +2854,8 @@ namespace rgbcx uint32_t best_index = 0; for (uint32_t j = 0; j < 8; j++) { - uint32_t err = squarei(block_vals[j] - pPixels[i * stride]); + int val = pPixels[i * stride]; + uint32_t err = squarei(block_vals16[j] - ((val << 8) | val)); if (err < best_index_err) { best_index_err = err; @@ -2872,6 +2889,10 @@ namespace rgbcx trial_block.m_selectors[5] = (uint8_t)(sel_vals >> 40); memcpy(pDst_bytes, &trial_block, sizeof(bc4_block)); + if (best_err == 0) { + // early out since we have a zero error + goto error_reached_zero; + } } // if (trial_err < best_err) } // hi_delta @@ -2879,8 +2900,9 @@ namespace rgbcx } // lo_delta } // mode + error_reached_zero: - return best_err; + return best_err >> 8; } void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try) diff --git a/rgbcx.h b/rgbcx.h index cf79392..78d4ab4 100644 --- a/rgbcx.h +++ b/rgbcx.h @@ -388,6 +388,7 @@ namespace rgbcx return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1); } + // Interpolated values as 8-bit (as per BC3 alpha) static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h) { pDst[0] = static_cast(l); @@ -401,6 +402,23 @@ namespace rgbcx return 6; } + // Interpolated values expanded to 16-bit (as per BC4/5) + static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h) + { + uint32_t l16 = (l << 8) | l; + uint32_t h16 = (h << 8) | h; + pDst[0] = static_cast(l16); + pDst[1] = static_cast(h16); + pDst[2] = static_cast((l16 * 4 + h16 ) / 5); + pDst[3] = static_cast((l16 * 3 + h16 * 2) / 5); + pDst[4] = static_cast((l16 * 2 + h16 * 3) / 5); + pDst[5] = static_cast((l16 + h16 * 4) / 5); + pDst[6] = 0; + pDst[7] = 65535; + return 6; + } + + // Interpolated values as 8-bit (as per BC3 alpha) static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h) { pDst[0] = static_cast(l); @@ -414,6 +432,22 @@ namespace rgbcx return 8; } + 
// Interpolated values expanded to 16-bit (as per BC4/5) + static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h) + { + uint32_t l16 = (l << 8) | l; + uint32_t h16 = (h << 8) | h; + pDst[0] = static_cast(l16); + pDst[1] = static_cast(h16); + pDst[2] = static_cast((l16 * 6 + h16 ) / 7); + pDst[3] = static_cast((l16 * 5 + h16 * 2) / 7); + pDst[4] = static_cast((l16 * 4 + h16 * 3) / 7); + pDst[5] = static_cast((l16 * 3 + h16 * 4) / 7); + pDst[6] = static_cast((l16 * 2 + h16 * 5) / 7); + pDst[7] = static_cast((l16 + h16 * 6) / 7); + return 8; + } + static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h) { if (l > h) @@ -421,6 +455,14 @@ namespace rgbcx else return get_block_values6(pDst, l, h); } + + static inline uint32_t get_block_values(uint16_t* pDst, uint32_t l, uint32_t h) + { + if (l > h) + return get_block_values8(pDst, l, h); + else + return get_block_values6(pDst, l, h); + } }; } From c07f3448a7f36597539f0f95cc534cbd5b6178e4 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Mon, 5 Sep 2022 11:06:56 +0200 Subject: [PATCH 2/4] Values expanded to 14-bit (to accumulate worst-case error) As 16-bit we couldn't accumulate the worst-case error without overflowing. Also fixed a bug whereby the values6 were truncated to 8-bit, therefore mostly favouring values8. The return from encode_bc4_hq() is now scaled to the same range from before the changes. 
--- rgbcx.cpp | 16 ++++++++-------- rgbcx.h | 47 ++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/rgbcx.cpp b/rgbcx.cpp index 063bbbb..f51b47c 100644 --- a/rgbcx.cpp +++ b/rgbcx.cpp @@ -2830,9 +2830,10 @@ namespace rgbcx else if (!trial_block.is_alpha6_block()) std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]); - // note: block vals are expanded to 16-bit, as is the error - uint16_t block_vals16[8]; - trial_block.get_block_values(block_vals16, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + // note: block vals are expanded to 8:6 fixed point, as is the error, + // with 8:6 able to accumulate 16x the worse-case error (255.98 ^ 2) + uint16_t block_vals14[8]; + trial_block.get_block_values(block_vals14, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); uint32_t trial_err = 0; uint8_t trial_sels[16]; @@ -2842,8 +2843,7 @@ namespace rgbcx memcpy(trial_sels, pForce_selectors, 16); for (uint32_t i = 0; i < 16; i++) { - int val = pPixels[i * stride]; - trial_err += squarei(block_vals16[pForce_selectors[i]] - ((val << 8) | val)); + trial_err += squarei(block_vals14[pForce_selectors[i]] - bc4_block::expand8to14(pPixels[i * stride])); } } else @@ -2854,8 +2854,7 @@ namespace rgbcx uint32_t best_index = 0; for (uint32_t j = 0; j < 8; j++) { - int val = pPixels[i * stride]; - uint32_t err = squarei(block_vals16[j] - ((val << 8) | val)); + uint32_t err = squarei(block_vals14[j] - bc4_block::expand8to14(pPixels[i * stride])); if (err < best_index_err) { best_index_err = err; @@ -2902,7 +2901,8 @@ namespace rgbcx } // mode error_reached_zero: - return best_err >> 8; + // scale the error back to 8-bit from 8:6 fixed point (to match what was previously returned) + return (best_err + 63) >> 12; } void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try) diff --git a/rgbcx.h b/rgbcx.h index 78d4ab4..7cd46f4 100644 --- a/rgbcx.h +++ b/rgbcx.h @@ 
-388,6 +388,11 @@ namespace rgbcx return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1); } + // Expands an 8-bit value to 14-bit + static inline uint32_t expand8to14(uint32_t val) { + return (val << 6) | (val >> 2); + } + // Interpolated values as 8-bit (as per BC3 alpha) static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h) { @@ -402,19 +407,19 @@ namespace rgbcx return 6; } - // Interpolated values expanded to 16-bit (as per BC4/5) + // Interpolated values expanded to 14-bit (for BC4/5) static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h) { - uint32_t l16 = (l << 8) | l; - uint32_t h16 = (h << 8) | h; - pDst[0] = static_cast(l16); - pDst[1] = static_cast(h16); - pDst[2] = static_cast((l16 * 4 + h16 ) / 5); - pDst[3] = static_cast((l16 * 3 + h16 * 2) / 5); - pDst[4] = static_cast((l16 * 2 + h16 * 3) / 5); - pDst[5] = static_cast((l16 + h16 * 4) / 5); + uint32_t l14 = expand8to14(l); + uint32_t h14 = expand8to14(h); + pDst[0] = static_cast(l14); + pDst[1] = static_cast(h14); + pDst[2] = static_cast((l14 * 4 + h14 ) / 5); + pDst[3] = static_cast((l14 * 3 + h14 * 2) / 5); + pDst[4] = static_cast((l14 * 2 + h14 * 3) / 5); + pDst[5] = static_cast((l14 + h14 * 4) / 5); pDst[6] = 0; - pDst[7] = 65535; + pDst[7] = static_cast(expand8to14(255)); return 6; } @@ -432,19 +437,19 @@ namespace rgbcx return 8; } - // Interpolated values expanded to 16-bit (as per BC4/5) + // Interpolated values expanded to 14-bit (for BC4/5) static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h) { - uint32_t l16 = (l << 8) | l; - uint32_t h16 = (h << 8) | h; - pDst[0] = static_cast(l16); - pDst[1] = static_cast(h16); - pDst[2] = static_cast((l16 * 6 + h16 ) / 7); - pDst[3] = static_cast((l16 * 5 + h16 * 2) / 7); - pDst[4] = static_cast((l16 * 4 + h16 * 3) / 7); - pDst[5] = static_cast((l16 * 3 + h16 * 4) / 7); - pDst[6] = static_cast((l16 * 2 + h16 * 5) / 7); - pDst[7] = 
static_cast((l16 + h16 * 6) / 7); + uint32_t l14 = expand8to14(l); + uint32_t h14 = expand8to14(h); + pDst[0] = static_cast(l14); + pDst[1] = static_cast(h14); + pDst[2] = static_cast((l14 * 6 + h14 ) / 7); + pDst[3] = static_cast((l14 * 5 + h14 * 2) / 7); + pDst[4] = static_cast((l14 * 4 + h14 * 3) / 7); + pDst[5] = static_cast((l14 * 3 + h14 * 4) / 7); + pDst[6] = static_cast((l14 * 2 + h14 * 5) / 7); + pDst[7] = static_cast((l14 + h14 * 6) / 7); return 8; } From 72b8ea0ad5427120daf62ffcb599ecf3151028ed Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Mon, 5 Sep 2022 15:46:45 +0200 Subject: [PATCH 3/4] Simplified 2-value search, removed 8-bit expansion --- rgbcx.cpp | 36 +++++++++++------------------------- rgbcx.h | 47 ----------------------------------------------- 2 files changed, 11 insertions(+), 72 deletions(-) diff --git a/rgbcx.cpp b/rgbcx.cpp index f51b47c..8cc0af7 100644 --- a/rgbcx.cpp +++ b/rgbcx.cpp @@ -2792,29 +2792,19 @@ namespace rgbcx return 0; } - // if we only have two values (min and max) the search radius can be set to zero (setting the endpoints directly) - bool has_two_values = true; - for (uint32_t i = 0; i < 16; i++) { - uint32_t val = pPixels[i * stride]; - if (val != min_val && val != max_val) { - has_two_values = false; - break; - } - } - if (has_two_values) { - search_rad = 0; - } - uint32_t best_err = UINT32_MAX; for (uint32_t mode = 0; mode < 2; mode++) { if ((mode_flag & (1 << mode)) == 0) continue; - for (int lo_delta = -(int)search_rad; lo_delta <= (int)search_rad; lo_delta++) + // the deltas go 0, -1, 1, -2, 2, -3, 3, etc., meaning 2-colour blocks are found first + for (int lo_count = 0; lo_count <= (int)search_rad << 1; lo_count++) { - for (int hi_delta = -(int)search_rad; hi_delta <= (int)search_rad; hi_delta++) + int lo_delta = ((lo_count & 1) ? -lo_count : lo_count) >> 1; + for (int hi_count = 0; hi_count <= (int)search_rad << 1; hi_count++) { + int hi_delta = ((hi_count & 1) ? 
-hi_count : hi_count) >> 1; bc4_block trial_block; trial_block.m_endpoints[0] = (uint8_t)clamp(max_val + hi_delta, 0, 255); trial_block.m_endpoints[1] = (uint8_t)clamp(min_val + lo_delta, 0, 255); @@ -2830,10 +2820,8 @@ namespace rgbcx else if (!trial_block.is_alpha6_block()) std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]); - // note: block vals are expanded to 8:6 fixed point, as is the error, - // with 8:6 able to accumulate 16x the worse-case error (255.98 ^ 2) - uint16_t block_vals14[8]; - trial_block.get_block_values(block_vals14, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); + uint8_t block_vals[8]; + trial_block.get_block_values(block_vals, trial_block.m_endpoints[0], trial_block.m_endpoints[1]); uint32_t trial_err = 0; uint8_t trial_sels[16]; @@ -2842,9 +2830,8 @@ namespace rgbcx { memcpy(trial_sels, pForce_selectors, 16); - for (uint32_t i = 0; i < 16; i++) { - trial_err += squarei(block_vals14[pForce_selectors[i]] - bc4_block::expand8to14(pPixels[i * stride])); - } + for (uint32_t i = 0; i < 16; i++) + trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]); } else { @@ -2854,7 +2841,7 @@ namespace rgbcx uint32_t best_index = 0; for (uint32_t j = 0; j < 8; j++) { - uint32_t err = squarei(block_vals14[j] - bc4_block::expand8to14(pPixels[i * stride])); + uint32_t err = squarei(block_vals[j] - pPixels[i * stride]); if (err < best_index_err) { best_index_err = err; @@ -2901,8 +2888,7 @@ namespace rgbcx } // mode error_reached_zero: - // scale the error back to 8-bit from 8:6 fixed point (to match what was previously returned) - return (best_err + 63) >> 12; + return best_err; } void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try) diff --git a/rgbcx.h b/rgbcx.h index 7cd46f4..cf79392 100644 --- a/rgbcx.h +++ b/rgbcx.h @@ -388,12 +388,6 @@ namespace rgbcx return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1); } - // Expands an 8-bit 
value to 14-bit - static inline uint32_t expand8to14(uint32_t val) { - return (val << 6) | (val >> 2); - } - - // Interpolated values as 8-bit (as per BC3 alpha) static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h) { pDst[0] = static_cast(l); @@ -407,23 +401,6 @@ namespace rgbcx return 6; } - // Interpolated values expanded to 14-bit (for BC4/5) - static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h) - { - uint32_t l14 = expand8to14(l); - uint32_t h14 = expand8to14(h); - pDst[0] = static_cast(l14); - pDst[1] = static_cast(h14); - pDst[2] = static_cast((l14 * 4 + h14 ) / 5); - pDst[3] = static_cast((l14 * 3 + h14 * 2) / 5); - pDst[4] = static_cast((l14 * 2 + h14 * 3) / 5); - pDst[5] = static_cast((l14 + h14 * 4) / 5); - pDst[6] = 0; - pDst[7] = static_cast(expand8to14(255)); - return 6; - } - - // Interpolated values as 8-bit (as per BC3 alpha) static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h) { pDst[0] = static_cast(l); @@ -437,22 +414,6 @@ namespace rgbcx return 8; } - // Interpolated values expanded to 14-bit (for BC4/5) - static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h) - { - uint32_t l14 = expand8to14(l); - uint32_t h14 = expand8to14(h); - pDst[0] = static_cast(l14); - pDst[1] = static_cast(h14); - pDst[2] = static_cast((l14 * 6 + h14 ) / 7); - pDst[3] = static_cast((l14 * 5 + h14 * 2) / 7); - pDst[4] = static_cast((l14 * 4 + h14 * 3) / 7); - pDst[5] = static_cast((l14 * 3 + h14 * 4) / 7); - pDst[6] = static_cast((l14 * 2 + h14 * 5) / 7); - pDst[7] = static_cast((l14 + h14 * 6) / 7); - return 8; - } - static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h) { if (l > h) @@ -460,14 +421,6 @@ namespace rgbcx else return get_block_values6(pDst, l, h); } - - static inline uint32_t get_block_values(uint16_t* pDst, uint32_t l, uint32_t h) - { - if (l > h) - return get_block_values8(pDst, l, h); - else - return 
get_block_values6(pDst, l, h); - } }; } From 280fe72e3dd846e9920dfdbfd0c6a080f4382531 Mon Sep 17 00:00:00 2001 From: Carl Woffenden Date: Mon, 5 Sep 2022 16:10:40 +0200 Subject: [PATCH 4/4] Match original style --- rgbcx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rgbcx.cpp b/rgbcx.cpp index 8cc0af7..0b34ba1 100644 --- a/rgbcx.cpp +++ b/rgbcx.cpp @@ -2830,7 +2830,7 @@ namespace rgbcx { memcpy(trial_sels, pForce_selectors, 16); - for (uint32_t i = 0; i < 16; i++) + for (uint32_t i = 0; i < 16; i++) trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]); } else