xBRZ slice calculation from batch size for more even parallel processing. Fix texture sizes (max PSX texture size is 256*256).
Nucleoprotein · Feb 5, 2016 · 399563a · 399563a
1 parent 1ded43e
commit 399563a
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 53 deletions.
diff --git a/gpuPeteOpenGL2Tweak/GPUPatches.cpp b/gpuPeteOpenGL2Tweak/GPUPatches.cpp
@@ -282,7 +282,7 @@ void APIENTRY GPUPatches::Hook_glCopyTexSubImage2D(GLenum target, GLint level, G
 	if (target != GL_TEXTURE_2D)
 		return oglCopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height);
 
-	if (width >= MAX_TEXTURE_X || height >= MAX_TEXTURE_Y)
+	if (width > MAX_TEXTURE_X || height > MAX_TEXTURE_Y)
 	{
 		PLUGINLOG("glCopyTexSubImage2D !!!Texture too big !!!");
 		return oglCopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height);
@@ -295,14 +295,14 @@ void (APIENTRY* GPUPatches::oglTexSubImage2D)(GLenum target, GLint level, GLint
 void APIENTRY GPUPatches::Hook_glTexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum type, const GLvoid *pixels)
 {
 	//PLUGINLOG("glTexSubImage2D 0x%08X %d %d %d %d %d 0x%08X 0x%08X %p", target, level, xoffset, yoffset, width, height, format, type, pixels);
-	if ((target != GL_TEXTURE_2D || format != GL_RGBA || width >= MAX_TEXTURE_X || height >= MAX_TEXTURE_Y))
+	if ((target != GL_TEXTURE_2D || format != GL_RGBA || width > MAX_TEXTURE_X || height > MAX_TEXTURE_Y))
 	{
 		return oglTexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, pixels);
 	}
 
 	if (format == GL_RGBA)
 	{
-		std::vector<u32> textureBuffer = s_pGPUPatches->ScaleTexture((u32*)pixels, width, height);
+		std::vector<u32>& textureBuffer = s_pGPUPatches->ScaleTexture((u32*)pixels, width, height);
 		return oglTexSubImage2D(target, level, xoffset * s_pGPUPatches->m_scale, yoffset * s_pGPUPatches->m_scale, width * s_pGPUPatches->m_scale, height * s_pGPUPatches->m_scale, format, type, textureBuffer.data());
 	}
 	return oglTexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, pixels);
@@ -311,18 +311,18 @@ void APIENTRY GPUPatches::Hook_glTexSubImage2D(GLenum target, GLint level, GLint
 void (APIENTRY* GPUPatches::oglTexImage2D)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels);
 void APIENTRY GPUPatches::Hook_glTexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const void *pixels)
 {
-	if (target == GL_TEXTURE_2D && format == GL_RGBA && width >= MAX_TEXTURE_X && height >= MAX_TEXTURE_Y)
+	if (target == GL_TEXTURE_2D && format == GL_RGBA && width > MAX_TEXTURE_X && height > MAX_TEXTURE_Y)
 		PLUGINLOG("ResHack: Detected render surface, size: %d x %d x 32bpp, %d MiB", width, height, (width * height * 4) / (1024 * 1024));
 
 	//PLUGINLOG("glTexImage2D 0x%08X %d %d %d 0x%08X 0x%08X %p", target, level, width, height, format, type, pixels);
-	if (target != GL_TEXTURE_2D || format != GL_RGBA || width >= MAX_TEXTURE_X || height >= MAX_TEXTURE_Y)
+	if (target != GL_TEXTURE_2D || format != GL_RGBA || width > MAX_TEXTURE_X || height > MAX_TEXTURE_Y)
 	{
 		return oglTexImage2D(target, level, internalformat, width, height, border, format, type, pixels);
 	}
 
 	if (format == GL_RGBA)
 	{
-		std::vector<u32> textureBuffer = s_pGPUPatches->ScaleTexture((u32*)pixels, width, height);
+		std::vector<u32>& textureBuffer = s_pGPUPatches->ScaleTexture((u32*)pixels, width, height);
 		return oglTexImage2D(target, level, internalformat, width * s_pGPUPatches->m_scale, height * s_pGPUPatches->m_scale, border, format, type, textureBuffer.data());
 	}
 	return oglTexImage2D(target, level, internalformat, width, height, border, format, type, pixels);
@@ -331,21 +331,30 @@ void APIENTRY GPUPatches::Hook_glTexImage2D(GLenum target, GLint level, GLint in
 std::vector<u32> GPUPatches::ScaleTexture(const u32* source, u32 srcWidth, u32 srcHeight)
 {
 	u32 texture_hash = XXH32(source, srcWidth * srcHeight * sizeof(u32), 0);
-	//PLUGINLOG("texture_hash: %llu", texture_hash);
 
 	if (m_texture_cache_size > 0 && texture_hash > 0 && !m_TextureCache[texture_hash].empty())
 	{
+		//PLUGINLOG("Cache HIT: %lu", texture_hash);
 		return m_TextureCache[texture_hash];
 	}
 
+	//PLUGINLOG("Cache MISS: %lu", texture_hash);
+
 	std::vector<u32> textureBuffer(srcWidth * m_scale * srcHeight * m_scale);
 
+	//calculate slice
+	int slice = (srcWidth * srcHeight) / m_batch_size;
+	if (slice > (int)(srcHeight / 2)) slice = srcHeight / 2;
+	if (slice <= 0) slice = 1;
+
+	//PLUGINLOG("slice: %lu, srcWidth: %lu, srcHeight: %lu, pixels: %lu", slice, srcWidth, srcHeight, srcWidth * srcHeight);
+
 	if ((m_fast_fbe && *locFBE) || m_force_nearest)
 	{
-		concurrency::parallel_for(0, (int)srcHeight, (int)m_slice, [&](const int& i)
+		concurrency::parallel_for(0, (int)srcHeight, slice, [&](const int& i)
 		{
 			xbrz::nearestNeighborScale(source, srcWidth, srcHeight, srcWidth * sizeof(u32), textureBuffer.data(),
-				srcWidth * m_scale, srcHeight * m_scale, srcWidth * m_scale * sizeof(u32), xbrz::SliceType::NN_SCALE_SLICE_SOURCE, i, i + m_slice);
+				srcWidth * m_scale, srcHeight * m_scale, srcWidth * m_scale * sizeof(u32), xbrz::SliceType::NN_SCALE_SLICE_SOURCE, i, i + slice);
 		}
 		);
 	}
@@ -357,17 +366,17 @@ std::vector<u32> GPUPatches::ScaleTexture(const u32* source, u32 srcWidth, u32 s
 			DePosterize(source, deposterizeBuffer.data(), srcWidth, srcHeight);
 			source = deposterizeBuffer.data();
 
-			concurrency::parallel_for(0, (int)srcHeight, (int)m_slice, [&](const int& i)
+			concurrency::parallel_for(0, (int)srcHeight, slice, [&](const int& i)
 			{
-				xbrz::scale(m_scale, source, textureBuffer.data(), srcWidth, srcHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), i, i + m_slice);
+				xbrz::scale(m_scale, source, textureBuffer.data(), srcWidth, srcHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), i, i + slice);
 			}
 			);
 		}
 		else
 		{
-			concurrency::parallel_for(0, (int)srcHeight, (int)m_slice, [&](const int& i)
+			concurrency::parallel_for(0, (int)srcHeight, slice, [&](const int& i)
 			{
-				xbrz::scale(m_scale, source, textureBuffer.data(), srcWidth, srcHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), i, i + m_slice);
+				xbrz::scale(m_scale, source, textureBuffer.data(), srcWidth, srcHeight, xbrz::ColorFormat::ARGB, xbrz::ScalerCfg(), i, i + slice);
 			}
 			);
 		}
@@ -381,7 +390,7 @@ std::vector<u32> GPUPatches::ScaleTexture(const u32* source, u32 srcWidth, u32 s
 			//PLUGINLOG("cache time size: %lu", m_TextureCacheTimestamp.size());
 			if (m_TextureCache.size() >= m_texture_cache_size)
 			{
-				//PLUGINLOG("removing hash %llu", m_TextureCacheTimestamp.front());
+				//PLUGINLOG("removing hash %lu", m_TextureCacheTimestamp.front());
 				m_TextureCache.erase(m_TextureCacheTimestamp.front());
 				m_TextureCacheTimestamp.erase(m_TextureCacheTimestamp.begin());
 				//PLUGINLOG("cache size: %lu", m_TextureCache.size());
@@ -391,7 +400,7 @@ std::vector<u32> GPUPatches::ScaleTexture(const u32* source, u32 srcWidth, u32 s
 	return std::move(textureBuffer);
 }
 
-void GPUPatches::EnableTextureScaler(u32 scale, u32 slice, bool force_nearest, bool fast_fbe, u32 texture_cache_size)
+void GPUPatches::EnableTextureScaler(u32 scale, u32 batch_size, bool force_nearest, bool fast_fbe, u32 texture_cache_size)
 {
 	if (scale <= 1)
 		return;
@@ -400,12 +409,12 @@ void GPUPatches::EnableTextureScaler(u32 scale, u32 slice, bool force_nearest, b
 	std::call_once(flag, [&]()
 	{
 		m_scale = scale;
-		m_slice = slice;
+		m_batch_size = batch_size;
 		m_force_nearest = force_nearest;
 		m_fast_fbe = fast_fbe;
 		m_texture_cache_size = texture_cache_size;
 
-		PLUGINLOG("%ux%s Texture Filter, slice %u", scale, m_force_nearest ? " Nearest Neighbour" : "BRZ", slice);
+		PLUGINLOG("%ux%s Texture Filter, BatchSize: %lu", m_scale, m_force_nearest ? " Nearest Neighbour" : "BRZ", batch_size);
 
 		MH_CreateHook(glTexImage2D, Hook_glTexImage2D, reinterpret_cast<void**>(&oglTexImage2D));
 		MH_EnableHook(glTexImage2D);

diff --git a/gpuPeteOpenGL2Tweak/GPUPatches.h b/gpuPeteOpenGL2Tweak/GPUPatches.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#define MAX_TEXTURE_X 512
-#define MAX_TEXTURE_Y 512
+#define MAX_TEXTURE_X 256
+#define MAX_TEXTURE_Y 256
 
 class GPUPatches
 {
@@ -26,12 +26,12 @@ class GPUPatches
     void ResHack(u32 _x, u32 _y);
 	void FixFullscreenAspect();
 	void ApplyWindowProc(HWND hWnd);
-	void EnableTextureScaler(u32 scale, u32 slice, bool force_nearest, bool fast_fbe, u32 texture_cache_size);
+	void EnableTextureScaler(u32 scale, u32 m_batch_size, bool force_nearest, bool fast_fbe, u32 texture_cache_size);
 
 private:
     GTEData m_gtedata;
 	u32 m_scale;
-	u32 m_slice;
+	u32 m_batch_size;
 
 	bool m_force_nearest;
 	bool m_fast_fbe;

diff --git a/gpuPeteOpenGL2Tweak/Settings.def b/gpuPeteOpenGL2Tweak/Settings.def
@@ -5,21 +5,21 @@ SETTING(LongValue, s32, VSyncInterval, "Options", 0, "#Enables VSync, paramater
 SETTING(BoolValue, bool, HideCursor, "Options", false, "#Hides cursor in window");
 SETTING(BoolValue, bool, UsePEC, "Options", false, "#Use Playstation Emulation Cheater plugin");
 SETTING(BoolValue, bool, HardcoreMode, "Options", false, "#Disables PEC and emulator save states (ie. F1/F2/F3 keys)");
-SETTING(BoolValue, bool, FixFullscreenAspect, "Options", false, "#Fixes fullscreen aspect ratio");
+SETTING(BoolValue, bool, FixFullscreenAspect, "Options", false, "#Fixes fullscreen aspect ratio, default disabled because it's buggy");
 
 SETTING(LongValue, s32, WindowX, "Window", -1, "#Moves window to X position");
 SETTING(LongValue, s32, WindowY, "Window", -1, "#Moves window to Y position");
 SETTING(BoolValue, bool, WindowOnTop, "Window", false, "#Set window on-top flag");
 
-SETTING(LongValue, u32, MulX, "ResHack", 0, "#Internal resolution X multiplier, recommended max: 8");
-SETTING(LongValue, u32, MulY, "ResHack", 0, "#Internal resolution Y multiplier, recommended max: 12");
+SETTING(LongValue, u32, MulX, "ResHack", 8, "#Internal resolution X multiplier, recommended max: 8");
+SETTING(LongValue, u32, MulY, "ResHack", 8, "#Internal resolution Y multiplier, recommended max: 12");
 
-SETTING(LongValue, u32, xBRZScale, "xBRZ", 0, "#xBRZ scale, max 6");
-SETTING(LongValue, u32, SliceSize, "xBRZ", 8, "#Texture slice for scaler, different slices can provide better or worse perfomance, recommended 8");
-SETTING(BoolValue, bool, Deposterize, "xBRZ", false, "#Deposterize");
-SETTING(BoolValue, bool, FastFBE, "xBRZ", true, "#Use nearest neighbour scaler for FBE, requires Standard setting for FBE!");
+SETTING(LongValue, u32, xBRZScale, "xBRZ", 4, "#xBRZ scale, max 6");
+SETTING(LongValue, u32, BatchSize, "xBRZ", 1024, "#Number of texels in batch for xBRZ scaler");
+SETTING(BoolValue, bool, Deposterize, "xBRZ", true, "#Deposterize, usefull for games with heavy dithering (Silent Hill), NOTE: single threaded processing");
+SETTING(BoolValue, bool, FastFBE, "xBRZ", true, "#Use nearest neighbour scaler for FBE, requires \"2: Standard\" setting for FBE!");
 SETTING(BoolValue, bool, ForceNearest, "xBRZ", false, "#Forces nearest neighbour scaler");
-SETTING(LongValue, u32, TextureCacheSize, "xBRZ", 128, "#Texture Cache size (in textures count!) for upscaled textures so texture is upscaled only once - fast!");
+SETTING(LongValue, u32, TextureCacheSize, "xBRZ", 256, "#Texture Cache size (in textures count!) for upscaled textures so texture is upscaled only once - fast!");
 
 SETTING(Value, std::string, Port1, "PadPlugin", "", "#Port1 pad plugin path");
 SETTING(Value, std::string, Port2, "PadPlugin", "", "#Port2 pad plugin path");
diff --git a/gpuPeteOpenGL2Tweak/gpuPeteOpenGL2Tweak.cpp b/gpuPeteOpenGL2Tweak/gpuPeteOpenGL2Tweak.cpp
@@ -44,7 +44,7 @@ s32 Context::OnGPUinit()
 		m_gpupatches.GTEAccuracy();
 
 	u32 scale = clamp<u32>(m_config.GetxBRZScale(), 1, 6);
-	m_gpupatches.EnableTextureScaler(scale, m_config.GetSliceSize(), m_config.GetForceNearest(), m_config.GetFastFBE(), m_config.GetTextureCacheSize());
+	m_gpupatches.EnableTextureScaler(scale, m_config.GetBatchSize(), m_config.GetForceNearest(), m_config.GetFastFBE(), m_config.GetTextureCacheSize());
 
 	if (m_config.GetMulX() > 0 && m_config.GetMulY() > 0)
 		m_gpupatches.ResHack(m_config.GetMulX(), m_config.GetMulY());

diff --git a/gpuPeteOpenGL2Tweak/gte_accuracy.cpp b/gpuPeteOpenGL2Tweak/gte_accuracy.cpp
@@ -33,7 +33,7 @@ void GPUaddVertex(s16 sx, s16 sy, s64 fx, s64 fy, s64 fz)
 	{
 		gteCoords[sy + 0x800][sx + 0x800].x = fx / (std::numeric_limits<u16>::max() * 1.0f);
 		gteCoords[sy + 0x800][sx + 0x800].y = fy / (std::numeric_limits<u16>::max() * 1.0f);
-		gteCoords[sy + 0x800][sx + 0x800].z = fz / (std::numeric_limits<u16>::max() * 1.0f);
+		//gteCoords[sy + 0x800][sx + 0x800].z = fz / (std::numeric_limits<u16>::max() * 1.0f);
 	}
 }
 
@@ -67,7 +67,7 @@ bool getGteVertex(s16 sx, s16 sy, OGLVertexTag* vertex)
 		{
 			vertex->x = gteCoords[sy + 0x800][sx + 0x800].x;
 			vertex->y = gteCoords[sy + 0x800][sx + 0x800].y;
-			vertex->z = gteCoords[sy + 0x800][sx + 0x800].z;
+			//vertex->z = gteCoords[sy + 0x800][sx + 0x800].z;
 
 			return true;
 		}

diff --git a/gpuPeteOpenGL2Tweak/xBRZ/xbrz.cpp b/gpuPeteOpenGL2Tweak/xBRZ/xbrz.cpp
@@ -175,38 +175,44 @@ double distRGB(uint32_t pix1, uint32_t pix2)
 }
 
 
+#if 0
 inline
 double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
 {
-    //http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-    //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
-    const int r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2); //we may delay division by 255 to after matrix multiplication
-    const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
-    const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2); //substraction for int is noticeable faster than for double!
-
-    //const double k_b = 0.0722; //ITU-R BT.709 conversion
-    //const double k_r = 0.2126; //
-    const double k_b = 0.0593; //ITU-R BT.2020 conversion
-    const double k_r = 0.2627; //
-    const double k_g = 1 - k_b - k_r;
-
-    const double scale_b = 0.5 / (1 - k_b);
-    const double scale_r = 0.5 / (1 - k_r);
-
-    const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
-    const double c_b = scale_b * (b_diff - y);
-    const double c_r = scale_r * (r_diff - y);
-
-    //we skip division by 255 to have similar range like other distance functions
-    return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
+	//http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
+	//YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
+	const int r_diff = static_cast<int>(getRed(pix1)) - getRed(pix2); //we may delay division by 255 to after matrix multiplication
+	const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
+	const int b_diff = static_cast<int>(getBlue(pix1)) - getBlue(pix2); //substraction for int is noticeable faster than for double!
+
+	//const double k_b = 0.0722; //ITU-R BT.709 conversion
+	//const double k_r = 0.2126; //
+	const double k_b = 0.0593; //ITU-R BT.2020 conversion
+	const double k_r = 0.2627; //
+	const double k_g = 1 - k_b - k_r;
+
+	const double scale_b = 0.5 / (1 - k_b);
+	const double scale_r = 0.5 / (1 - k_r);
+
+	const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
+	const double c_b = scale_b * (b_diff - y);
+	const double c_r = scale_r * (r_diff - y);
+
+	//we skip division by 255 to have similar range like other distance functions
+	return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
 }
+#endif // 0
+
 
 
 struct DistYCbCrBuffer //30% perf boost compared to distYCbCr()!
 {
 public:
     static double dist(uint32_t pix1, uint32_t pix2)
     {
+#if defined _MSC_VER && _MSC_VER < 1900
+#error function scope static initialization is not yet thread-safe!
+#endif
         static const DistYCbCrBuffer inst;
         return inst.distImpl(pix1, pix2);
     }