Merge pull request #8 from quantifyearth/mwd-fix-chunk-impact-on-sum-…

…float32 Fix issue with sum on float32 arrays being impacted by chunk size.
quantifyearth · Nov 6, 2024 · 75ff7be · 75ff7be
2 parents 2f6479c + a887e07
commit 75ff7be
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 7 deletions.
diff --git a/tests/helpers.py b/tests/helpers.py
@@ -65,9 +65,37 @@ def gdal_dataset_of_layer(layer: YirgacheffeLayer, filename: Optional[str]=None)
     dataset.SetProjection(layer.projection)
     return dataset
 
+def numpy_to_gdal_type(val: np.array) -> int:
+    match val.dtype:
+        case np.float32:
+            return gdal.GDT_Float32
+        case np.float64:
+            return gdal.GDT_Float64
+        case np.int8:
+            return gdal.GDT_Byte
+        case np.int16:
+            return gdal.GDT_Int16
+        case np.int32:
+            return gdal.GDT_Int32
+        case np.int64:
+            return gdal.GDT_Int64
+        case np.uint8:
+            return gdal.GDT_Byte
+        case np.uint16:
+            return gdal.GDT_UInt16
+        case np.uint32:
+            return gdal.GDT_UInt32
+        case np.uint64:
+            return gdal.GDT_UInt64
+        case _:
+            raise ValueError
+
+
 def gdal_dataset_with_data(origin: Tuple, pixel_pitch: float, data: np.array, filename: Optional[str]=None) -> gdal.Dataset:
     assert data.ndim == 2
-    datatype = gdal.GDT_Byte
+
+    datatype = numpy_to_gdal_type(data)
+
     if isinstance(data[0][0], float):
         datatype = gdal.GDT_Float64
     if filename:
@@ -220,4 +248,4 @@ def make_vectors_with_empty_feature(areas: Set[Tuple[Area,int]], filename: str)
     feature = ogr.Feature(feature_definition)
     layer.CreateFeature(feature)
 
-    package.Destroy()
+    package.Destroy()
diff --git a/tests/test_operators.py b/tests/test_operators.py
@@ -1,10 +1,13 @@
 import os
+import random
 import tempfile
 
 import numpy as np
 import pytest
+from osgeo import gdal
 
 from helpers import gdal_dataset_with_data
+import yirgacheffe
 from yirgacheffe.layers import RasterLayer, ConstantLayer
 from yirgacheffe.operators import LayerOperation
 
@@ -491,3 +494,26 @@ def test_write_mulitband_raster() -> None:
             actual = layer.read_array(0, 0, 4, 2)
 
             assert (expected == actual).all()
+
+def test_sum_float32(monkeypatch) -> None:
+
+    random.seed(42)
+    data = []
+    for _ in range(10):
+        row = []
+        for _ in range(10):
+            row.append(random.random())
+        data.append(row)
+
+    data1 = np.array(data, dtype=np.float32)
+    layer1 = RasterLayer(gdal_dataset_with_data((0.0, 0.0), 0.02, data1))
+    assert layer1.datatype == gdal.GDT_Float32
+
+    # Sum forces things to float64
+    expected = np.sum(data1.astype(np.float64))
+
+    with monkeypatch.context() as m:
+        for blocksize in range(1,11):
+            m.setattr(yirgacheffe.constants, "YSTEP", blocksize)
+            actual = layer1.sum()
+            assert expected == actual
diff --git a/yirgacheffe/constants.py b/yirgacheffe/constants.py
@@ -0,0 +1 @@
+YSTEP = 512
diff --git a/yirgacheffe/operators.py b/yirgacheffe/operators.py
@@ -9,12 +9,10 @@
 from osgeo import gdal
 from dill import dumps, loads
 
+from . import constants
 from .window import Window
 
 
-
-YSTEP = 512
-
 class LayerConstant:
     def __init__(self, val):
         self.val = val
@@ -93,7 +91,7 @@ def max(self):
 class LayerOperation(LayerMathMixin):
 
     def __init__(self, lhs, operator=None, rhs=None):
-        self.ystep = YSTEP
+        self.ystep = constants.YSTEP
 
         if lhs is None:
             raise ValueError("LHS on operation should not be none")
@@ -162,14 +160,18 @@ def _eval(self, index, step): # pylint: disable=W0221
             return self.operator(lhs_data)
 
     def sum(self):
+        # The result accumulator is float64, and for precision reasons
+        # we force the sum to be done in float64 also. Otherwise we
+        # see variable results depending on chunk size, as different parts
+        # of the sum are done in different types.
         res = 0.0
         computation_window = self.window
         for yoffset in range(0, computation_window.ysize, self.ystep):
             step=self.ystep
             if yoffset+step > computation_window.ysize:
                 step = computation_window.ysize - yoffset
             chunk = self._eval(yoffset, step)
-            res += np.sum(chunk)
+            res += np.sum(chunk.astype(np.float64))
         return res
 
     def min(self):