From 457c74f7be972d84a4f13b53c3ad75ff71c9a039 Mon Sep 17 00:00:00 2001
From: Carl Woffenden <cwoffenden@gmail.com>
Date: Sun, 4 Sep 2022 18:45:33 +0200
Subject: [PATCH 1/4] BC4/5 fixes and performance improvements

This fixes #17 but goes further, since it provides higher accuracy for other blocks with few values. Two-value blocks (containing only the min and max values) are special-cased by setting the search radius to zero, so the two endpoints are used directly. An early out is taken when the error reaches zero.
---
 rgbcx.cpp | 34 ++++++++++++++++++++++++++++------
 rgbcx.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/rgbcx.cpp b/rgbcx.cpp
index 609a4d4..063bbbb 100644
--- a/rgbcx.cpp
+++ b/rgbcx.cpp
@@ -2792,6 +2792,19 @@ namespace rgbcx
 			return 0;
 		}
 
+		// if we only have two values (min and max) the search radius can be set to zero (setting the endpoints directly)
+		bool has_two_values = true;
+		for (uint32_t i = 0; i < 16; i++) {
+			uint32_t val = pPixels[i * stride];
+			if (val != min_val && val != max_val) {
+				has_two_values = false;
+				break;
+			}
+		}
+		if (has_two_values) {
+			search_rad = 0;
+		}
+
 		uint32_t best_err = UINT32_MAX;
 		for (uint32_t mode = 0; mode < 2; mode++)
 		{
@@ -2817,8 +2830,9 @@ namespace rgbcx
 					else if (!trial_block.is_alpha6_block())
 						std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
-					uint8_t block_vals[8];
-					trial_block.get_block_values(block_vals, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
+					// note: block vals are expanded to 16-bit, as is the error
+					uint16_t block_vals16[8];
+					trial_block.get_block_values(block_vals16, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
 					uint32_t trial_err = 0;
 					uint8_t trial_sels[16];
@@ -2827,8 +2841,10 @@ namespace rgbcx
 					{
 						memcpy(trial_sels, pForce_selectors, 16);
 
-						for (uint32_t i = 0; i < 16; i++)
-							trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]);
+						for (uint32_t i = 0; i < 16; i++) {
+							int val = pPixels[i * stride];
+							trial_err += squarei(block_vals16[pForce_selectors[i]] - ((val << 8) | val));
+						}
 					}
 					else
 					{
@@ -2838,7 +2854,8 @@ namespace rgbcx
 							uint32_t best_index = 0;
 							for (uint32_t j = 0; j < 8; j++)
 							{
-								uint32_t err = squarei(block_vals[j] - pPixels[i * stride]);
+								int val = pPixels[i * stride];
+								uint32_t err = squarei(block_vals16[j] - ((val << 8) | val));
 								if (err < best_index_err)
 								{
 									best_index_err = err;
@@ -2872,6 +2889,10 @@ namespace rgbcx
 						trial_block.m_selectors[5] = (uint8_t)(sel_vals >> 40);
 
 						memcpy(pDst_bytes, &trial_block, sizeof(bc4_block));
+						if (best_err == 0) {
+							// early out since we have a zero error
+							goto error_reached_zero;
+						}
 					} // if (trial_err < best_err)
 
 				} // hi_delta
@@ -2879,8 +2900,9 @@ namespace rgbcx
 			} // lo_delta
 
 		} // mode
+	error_reached_zero:
 
-		return best_err;
+		return best_err >> 8;
 	}
 
 	void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try)
diff --git a/rgbcx.h b/rgbcx.h
index cf79392..78d4ab4 100644
--- a/rgbcx.h
+++ b/rgbcx.h
@@ -388,6 +388,7 @@ namespace rgbcx
 			return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1);
 		}
 
+		// Interpolated values as 8-bit (as per BC3 alpha)
 		static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			pDst[0] = static_cast<uint8_t>(l);
@@ -401,6 +402,23 @@ namespace rgbcx
 			return 6;
 		}
 
+		// Interpolated values expanded to 16-bit (as per BC4/5)
+		static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h)
+		{
+			uint32_t l16 = (l << 8) | l;
+			uint32_t h16 = (h << 8) | h;
+			pDst[0] = static_cast<uint8_t>(l16);
+			pDst[1] = static_cast<uint8_t>(h16);
+			pDst[2] = static_cast<uint8_t>((l16 * 4 + h16    ) / 5);
+			pDst[3] = static_cast<uint8_t>((l16 * 3 + h16 * 2) / 5);
+			pDst[4] = static_cast<uint8_t>((l16 * 2 + h16 * 3) / 5);
+			pDst[5] = static_cast<uint8_t>((l16     + h16 * 4) / 5);
+			pDst[6] = 0;
+			pDst[7] = 65535;
+			return 6;
+		}
+
+		// Interpolated values as 8-bit (as per BC3 alpha)
 		static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			pDst[0] = static_cast<uint8_t>(l);
@@ -414,6 +432,22 @@ namespace rgbcx
 			return 8;
 		}
 
+		// Interpolated values expanded to 16-bit (as per BC4/5)
+		static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h)
+		{
+			uint32_t l16 = (l << 8) | l;
+			uint32_t h16 = (h << 8) | h;
+			pDst[0] = static_cast<uint16_t>(l16);
+			pDst[1] = static_cast<uint16_t>(h16);
+			pDst[2] = static_cast<uint16_t>((l16 * 6 + h16    ) / 7);
+			pDst[3] = static_cast<uint16_t>((l16 * 5 + h16 * 2) / 7);
+			pDst[4] = static_cast<uint16_t>((l16 * 4 + h16 * 3) / 7);
+			pDst[5] = static_cast<uint16_t>((l16 * 3 + h16 * 4) / 7);
+			pDst[6] = static_cast<uint16_t>((l16 * 2 + h16 * 5) / 7);
+			pDst[7] = static_cast<uint16_t>((l16     + h16 * 6) / 7);
+			return 8;
+		}
+
 		static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			if (l > h)
@@ -421,6 +455,14 @@ namespace rgbcx
 			else
 				return get_block_values6(pDst, l, h);
 		}
+
+		static inline uint32_t get_block_values(uint16_t* pDst, uint32_t l, uint32_t h)
+		{
+			if (l > h)
+				return get_block_values8(pDst, l, h);
+			else
+				return get_block_values6(pDst, l, h);
+		}
 	};
 
 }

From c07f3448a7f36597539f0f95cc534cbd5b6178e4 Mon Sep 17 00:00:00 2001
From: Carl Woffenden <cwoffenden@gmail.com>
Date: Mon, 5 Sep 2022 11:06:56 +0200
Subject: [PATCH 2/4] Values expanded to 14-bit (to accumulate worst-case
 error)

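With the values expanded to 16-bit we couldn't accumulate the worst-case error over all 16 pixels without overflowing, so they are now expanded to 14-bit (8:6 fixed point) instead. Also fixed a bug whereby the interpolated values from get_block_values6() were truncated to 8-bit, which mostly favoured get_block_values8(). The return value from encode_bc4_hq() is now scaled back to the same range as before these changes.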
---
 rgbcx.cpp | 16 ++++++++--------
 rgbcx.h   | 47 ++++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/rgbcx.cpp b/rgbcx.cpp
index 063bbbb..f51b47c 100644
--- a/rgbcx.cpp
+++ b/rgbcx.cpp
@@ -2830,9 +2830,10 @@ namespace rgbcx
 					else if (!trial_block.is_alpha6_block())
 						std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
-					// note: block vals are expanded to 16-bit, as is the error
-					uint16_t block_vals16[8];
-					trial_block.get_block_values(block_vals16, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
+					// note: block vals are expanded to 8:6 fixed point, as is the error,
+					// with 8:6 able to accumulate 16x the worse-case error (255.98 ^ 2)
+					uint16_t block_vals14[8];
+					trial_block.get_block_values(block_vals14, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
 					uint32_t trial_err = 0;
 					uint8_t trial_sels[16];
@@ -2842,8 +2843,7 @@ namespace rgbcx
 						memcpy(trial_sels, pForce_selectors, 16);
 
 						for (uint32_t i = 0; i < 16; i++) {
-							int val = pPixels[i * stride];
-							trial_err += squarei(block_vals16[pForce_selectors[i]] - ((val << 8) | val));
+							trial_err += squarei(block_vals14[pForce_selectors[i]] - bc4_block::expand8to14(pPixels[i * stride]));
 						}
 					}
 					else
@@ -2854,8 +2854,7 @@ namespace rgbcx
 							uint32_t best_index = 0;
 							for (uint32_t j = 0; j < 8; j++)
 							{
-								int val = pPixels[i * stride];
-								uint32_t err = squarei(block_vals16[j] - ((val << 8) | val));
+								uint32_t err = squarei(block_vals14[j] - bc4_block::expand8to14(pPixels[i * stride]));
 								if (err < best_index_err)
 								{
 									best_index_err = err;
@@ -2902,7 +2901,8 @@ namespace rgbcx
 		} // mode
 	error_reached_zero:
 
-		return best_err >> 8;
+		// scale the error back to 8-bit from 8:6 fixed point (to match what was previously returned)
+		return (best_err + 63) >> 12;
 	}
 
 	void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try)
diff --git a/rgbcx.h b/rgbcx.h
index 78d4ab4..7cd46f4 100644
--- a/rgbcx.h
+++ b/rgbcx.h
@@ -388,6 +388,11 @@ namespace rgbcx
 			return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1);
 		}
 
+		// Expands an 8-bit value to 14-bit
+		static inline uint32_t expand8to14(uint32_t val) {
+			return (val << 6) | (val >> 2);
+		}
+
 		// Interpolated values as 8-bit (as per BC3 alpha)
 		static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
@@ -402,19 +407,19 @@ namespace rgbcx
 			return 6;
 		}
 
-		// Interpolated values expanded to 16-bit (as per BC4/5)
+		// Interpolated values expanded to 14-bit (for BC4/5)
 		static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h)
 		{
-			uint32_t l16 = (l << 8) | l;
-			uint32_t h16 = (h << 8) | h;
-			pDst[0] = static_cast<uint8_t>(l16);
-			pDst[1] = static_cast<uint8_t>(h16);
-			pDst[2] = static_cast<uint8_t>((l16 * 4 + h16    ) / 5);
-			pDst[3] = static_cast<uint8_t>((l16 * 3 + h16 * 2) / 5);
-			pDst[4] = static_cast<uint8_t>((l16 * 2 + h16 * 3) / 5);
-			pDst[5] = static_cast<uint8_t>((l16     + h16 * 4) / 5);
+			uint32_t l14 = expand8to14(l);
+			uint32_t h14 = expand8to14(h);
+			pDst[0] = static_cast<uint16_t>(l14);
+			pDst[1] = static_cast<uint16_t>(h14);
+			pDst[2] = static_cast<uint16_t>((l14 * 4 + h14    ) / 5);
+			pDst[3] = static_cast<uint16_t>((l14 * 3 + h14 * 2) / 5);
+			pDst[4] = static_cast<uint16_t>((l14 * 2 + h14 * 3) / 5);
+			pDst[5] = static_cast<uint16_t>((l14     + h14 * 4) / 5);
 			pDst[6] = 0;
-			pDst[7] = 65535;
+			pDst[7] = static_cast<uint16_t>(expand8to14(255));
 			return 6;
 		}
 
@@ -432,19 +437,19 @@ namespace rgbcx
 			return 8;
 		}
 
-		// Interpolated values expanded to 16-bit (as per BC4/5)
+		// Interpolated values expanded to 14-bit (for BC4/5)
 		static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h)
 		{
-			uint32_t l16 = (l << 8) | l;
-			uint32_t h16 = (h << 8) | h;
-			pDst[0] = static_cast<uint16_t>(l16);
-			pDst[1] = static_cast<uint16_t>(h16);
-			pDst[2] = static_cast<uint16_t>((l16 * 6 + h16    ) / 7);
-			pDst[3] = static_cast<uint16_t>((l16 * 5 + h16 * 2) / 7);
-			pDst[4] = static_cast<uint16_t>((l16 * 4 + h16 * 3) / 7);
-			pDst[5] = static_cast<uint16_t>((l16 * 3 + h16 * 4) / 7);
-			pDst[6] = static_cast<uint16_t>((l16 * 2 + h16 * 5) / 7);
-			pDst[7] = static_cast<uint16_t>((l16     + h16 * 6) / 7);
+			uint32_t l14 = expand8to14(l);
+			uint32_t h14 = expand8to14(h);
+			pDst[0] = static_cast<uint16_t>(l14);
+			pDst[1] = static_cast<uint16_t>(h14);
+			pDst[2] = static_cast<uint16_t>((l14 * 6 + h14    ) / 7);
+			pDst[3] = static_cast<uint16_t>((l14 * 5 + h14 * 2) / 7);
+			pDst[4] = static_cast<uint16_t>((l14 * 4 + h14 * 3) / 7);
+			pDst[5] = static_cast<uint16_t>((l14 * 3 + h14 * 4) / 7);
+			pDst[6] = static_cast<uint16_t>((l14 * 2 + h14 * 5) / 7);
+			pDst[7] = static_cast<uint16_t>((l14     + h14 * 6) / 7);
 			return 8;
 		}
 

From 72b8ea0ad5427120daf62ffcb599ecf3151028ed Mon Sep 17 00:00:00 2001
From: Carl Woffenden <cwoffenden@gmail.com>
Date: Mon, 5 Sep 2022 15:46:45 +0200
Subject: [PATCH 3/4] Simplified 2-value search, removed 8-to-14-bit expansion

---
 rgbcx.cpp | 36 +++++++++++-------------------------
 rgbcx.h   | 47 -----------------------------------------------
 2 files changed, 11 insertions(+), 72 deletions(-)

diff --git a/rgbcx.cpp b/rgbcx.cpp
index f51b47c..8cc0af7 100644
--- a/rgbcx.cpp
+++ b/rgbcx.cpp
@@ -2792,29 +2792,19 @@ namespace rgbcx
 			return 0;
 		}
 
-		// if we only have two values (min and max) the search radius can be set to zero (setting the endpoints directly)
-		bool has_two_values = true;
-		for (uint32_t i = 0; i < 16; i++) {
-			uint32_t val = pPixels[i * stride];
-			if (val != min_val && val != max_val) {
-				has_two_values = false;
-				break;
-			}
-		}
-		if (has_two_values) {
-			search_rad = 0;
-		}
-
 		uint32_t best_err = UINT32_MAX;
 		for (uint32_t mode = 0; mode < 2; mode++)
 		{
 			if ((mode_flag & (1 << mode)) == 0)
 				continue;
 
-			for (int lo_delta = -(int)search_rad; lo_delta <= (int)search_rad; lo_delta++)
+			// the deltas go 0, -1, 1, -2, 2, -3, 3, etc., meaning 2-colour blocks are found first
+			for (int lo_count = 0; lo_count <= (int)search_rad << 1; lo_count++)
 			{
-				for (int hi_delta = -(int)search_rad; hi_delta <= (int)search_rad; hi_delta++)
+				int lo_delta = ((lo_count & 1) ? -lo_count : lo_count) >> 1;
+				for (int hi_count = 0; hi_count <= (int)search_rad << 1; hi_count++)
 				{
+					int hi_delta = ((hi_count & 1) ? -hi_count : hi_count) >> 1;
 					bc4_block trial_block;
 					trial_block.m_endpoints[0] = (uint8_t)clamp<int>(max_val + hi_delta, 0, 255);
 					trial_block.m_endpoints[1] = (uint8_t)clamp<int>(min_val + lo_delta, 0, 255);
@@ -2830,10 +2820,8 @@ namespace rgbcx
 					else if (!trial_block.is_alpha6_block())
 						std::swap(trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
-					// note: block vals are expanded to 8:6 fixed point, as is the error,
-					// with 8:6 able to accumulate 16x the worse-case error (255.98 ^ 2)
-					uint16_t block_vals14[8];
-					trial_block.get_block_values(block_vals14, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
+					uint8_t block_vals[8];
+					trial_block.get_block_values(block_vals, trial_block.m_endpoints[0], trial_block.m_endpoints[1]);
 
 					uint32_t trial_err = 0;
 					uint8_t trial_sels[16];
@@ -2842,9 +2830,8 @@ namespace rgbcx
 					{
 						memcpy(trial_sels, pForce_selectors, 16);
 
-						for (uint32_t i = 0; i < 16; i++) {
-							trial_err += squarei(block_vals14[pForce_selectors[i]] - bc4_block::expand8to14(pPixels[i * stride]));
-						}
+						for (uint32_t i = 0; i < 16; i++) 
+							trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]);
 					}
 					else
 					{
@@ -2854,7 +2841,7 @@ namespace rgbcx
 							uint32_t best_index = 0;
 							for (uint32_t j = 0; j < 8; j++)
 							{
-								uint32_t err = squarei(block_vals14[j] - bc4_block::expand8to14(pPixels[i * stride]));
+								uint32_t err = squarei(block_vals[j] - pPixels[i * stride]);
 								if (err < best_index_err)
 								{
 									best_index_err = err;
@@ -2901,8 +2888,7 @@ namespace rgbcx
 		} // mode
 	error_reached_zero:
 
-		// scale the error back to 8-bit from 8:6 fixed point (to match what was previously returned)
-		return (best_err + 63) >> 12;
+		return best_err;
 	}
 
 	void encode_bc3(void* pDst, const uint8_t* pPixels, uint32_t flags, uint32_t total_orderings_to_try)
diff --git a/rgbcx.h b/rgbcx.h
index 7cd46f4..cf79392 100644
--- a/rgbcx.h
+++ b/rgbcx.h
@@ -388,12 +388,6 @@ namespace rgbcx
 			return (selector_bits >> (((y * 4) + x) * cBC4SelectorBits)) & (cMaxSelectorValues - 1);
 		}
 
-		// Expands an 8-bit value to 14-bit
-		static inline uint32_t expand8to14(uint32_t val) {
-			return (val << 6) | (val >> 2);
-		}
-
-		// Interpolated values as 8-bit (as per BC3 alpha)
 		static inline uint32_t get_block_values6(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			pDst[0] = static_cast<uint8_t>(l);
@@ -407,23 +401,6 @@ namespace rgbcx
 			return 6;
 		}
 
-		// Interpolated values expanded to 14-bit (for BC4/5)
-		static inline uint32_t get_block_values6(uint16_t* pDst, uint32_t l, uint32_t h)
-		{
-			uint32_t l14 = expand8to14(l);
-			uint32_t h14 = expand8to14(h);
-			pDst[0] = static_cast<uint16_t>(l14);
-			pDst[1] = static_cast<uint16_t>(h14);
-			pDst[2] = static_cast<uint16_t>((l14 * 4 + h14    ) / 5);
-			pDst[3] = static_cast<uint16_t>((l14 * 3 + h14 * 2) / 5);
-			pDst[4] = static_cast<uint16_t>((l14 * 2 + h14 * 3) / 5);
-			pDst[5] = static_cast<uint16_t>((l14     + h14 * 4) / 5);
-			pDst[6] = 0;
-			pDst[7] = static_cast<uint16_t>(expand8to14(255));
-			return 6;
-		}
-
-		// Interpolated values as 8-bit (as per BC3 alpha)
 		static inline uint32_t get_block_values8(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			pDst[0] = static_cast<uint8_t>(l);
@@ -437,22 +414,6 @@ namespace rgbcx
 			return 8;
 		}
 
-		// Interpolated values expanded to 14-bit (for BC4/5)
-		static inline uint32_t get_block_values8(uint16_t* pDst, uint32_t l, uint32_t h)
-		{
-			uint32_t l14 = expand8to14(l);
-			uint32_t h14 = expand8to14(h);
-			pDst[0] = static_cast<uint16_t>(l14);
-			pDst[1] = static_cast<uint16_t>(h14);
-			pDst[2] = static_cast<uint16_t>((l14 * 6 + h14    ) / 7);
-			pDst[3] = static_cast<uint16_t>((l14 * 5 + h14 * 2) / 7);
-			pDst[4] = static_cast<uint16_t>((l14 * 4 + h14 * 3) / 7);
-			pDst[5] = static_cast<uint16_t>((l14 * 3 + h14 * 4) / 7);
-			pDst[6] = static_cast<uint16_t>((l14 * 2 + h14 * 5) / 7);
-			pDst[7] = static_cast<uint16_t>((l14     + h14 * 6) / 7);
-			return 8;
-		}
-
 		static inline uint32_t get_block_values(uint8_t* pDst, uint32_t l, uint32_t h)
 		{
 			if (l > h)
@@ -460,14 +421,6 @@ namespace rgbcx
 			else
 				return get_block_values6(pDst, l, h);
 		}
-
-		static inline uint32_t get_block_values(uint16_t* pDst, uint32_t l, uint32_t h)
-		{
-			if (l > h)
-				return get_block_values8(pDst, l, h);
-			else
-				return get_block_values6(pDst, l, h);
-		}
 	};
 
 }

From 280fe72e3dd846e9920dfdbfd0c6a080f4382531 Mon Sep 17 00:00:00 2001
From: Carl Woffenden <cwoffenden@gmail.com>
Date: Mon, 5 Sep 2022 16:10:40 +0200
Subject: [PATCH 4/4] Match original style

---
 rgbcx.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rgbcx.cpp b/rgbcx.cpp
index 8cc0af7..0b34ba1 100644
--- a/rgbcx.cpp
+++ b/rgbcx.cpp
@@ -2830,7 +2830,7 @@ namespace rgbcx
 					{
 						memcpy(trial_sels, pForce_selectors, 16);
 
-						for (uint32_t i = 0; i < 16; i++) 
+						for (uint32_t i = 0; i < 16; i++)
 							trial_err += squarei(block_vals[pForce_selectors[i]] - pPixels[i * stride]);
 					}
 					else