Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
4.4 (unreleased)
================

- The dada reader now supports reading Meerkat MKBF files, which store
data in heaps of 256 samples each (with the samples along the
fastest axis). [#542]

4.3.1 (unreleased)
==================
Expand Down
82 changes: 43 additions & 39 deletions baseband/dada/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,46 +62,45 @@ class DADAHeader(OrderedDict):
'time')
"""Properties accessible/usable in initialisation for all headers."""

_defaults = [('HEADER', 'DADA'),
('HDR_VERSION', '1.0'),
('HDR_SIZE', 4096),
('DADA_VERSION', '1.0'),
('OBS_ID', 'unset'),
('PRIMARY', 'unset'),
('SECONDARY', 'unset'),
('FILE_NAME', 'unset'),
('FILE_NUMBER', 0),
('FILE_SIZE', 0),
('OBS_OFFSET', 0),
('OBS_OVERLAP', 0),
('SOURCE', 'unset'),
('TELESCOPE', 'unset'),
('INSTRUMENT', 'unset'),
('RECEIVER', 'unset'),
('NBIT', 8),
('NDIM', 1),
('NPOL', 1),
('NCHAN', 1),
('RESOLUTION', 1),
('DSB', 1)]
_defaults = {
'HEADER': 'DADA',
'HDR_VERSION': '1.0',
'HDR_SIZE': 4096,
'DADA_VERSION': '1.0',
'OBS_ID': 'unset',
'PRIMARY': 'unset',
'SECONDARY': 'unset',
'FILE_NAME': 'unset',
'FILE_NUMBER': 0,
'FILE_SIZE': 0,
'OBS_OFFSET': 0,
'OBS_OVERLAP': 0,
'SOURCE': 'unset',
'TELESCOPE': 'unset',
'INSTRUMENT': 'unset',
'RECEIVER': 'unset',
'NBIT': 8,
'NDIM': 1,
'NPOL': 1,
'NCHAN': 1,
'RESOLUTION': 1,
'DSB': 1,
}

def __init__(self, *args, verify=True, mutable=True, **kwargs):
self.mutable = True
self.comments = {}
if len(args) == 1 and isinstance(args[0], str):
args = (self._fromlines(args[0].split('\n')),)

super().__init__(*args, **kwargs)
super().__init__(**self._fromlines(args[0].split('\n')), **kwargs)
else:
super().__init__(*args, **kwargs)
self.mutable = mutable
if verify and (args or kwargs):
self.verify()

def verify(self):
"""Basic check of integrity."""
assert self['HEADER'] == 'DADA'
assert all(key in self for key in ('HDR_VERSION',
'HDR_SIZE',
'DADA_VERSION'))
assert len(set(self.keys()).intersection(self._defaults.keys())) > 10

def copy(self):
"""Create a mutable and independent copy of the header."""
Expand All @@ -118,7 +117,7 @@ def __copy__(self):
@staticmethod
def _fromlines(lines):
"""Interpret a list of lines as a header, converting its values."""
args = []
kwargs = {}
for line_no, line in enumerate(lines):
split = line.strip().split('#')
comment = split[1].strip() if (len(split) > 1
Expand All @@ -135,9 +134,9 @@ def _fromlines(lines):
elif key in ('FREQ', 'BW', 'TSAMP'):
value = float(value)

args.append((key, (value, comment)))
kwargs[key] = (value, comment)

return args
return kwargs

def _tolines(self):
"""Write header to a list of strings."""
Expand Down Expand Up @@ -198,7 +197,7 @@ def fromfile(cls, fh, verify=True):
else:
fh.seek(start_pos + hdr_size)

return cls(cls._fromlines(lines), verify=verify, mutable=False)
return cls(**cls._fromlines(lines), verify=verify, mutable=False)

def tofile(self, fh):
"""Write DADA file header to filehandle.
Expand Down Expand Up @@ -248,7 +247,7 @@ def fromvalues(cls, **kwargs):

Furthermore, some header defaults are set in ``DADAHeader._defaults``.
"""
self = cls(cls._defaults, verify=False)
self = cls(**cls._defaults, verify=False)
self.update(**kwargs)
return self

Expand Down Expand Up @@ -400,11 +399,16 @@ def offset(self, offset):
@property
def start_time(self):
"""Start time of the observation."""
mjd_int, frac = self['MJD_START'].split('.')
mjd_int = int(mjd_int)
frac = float('.' + frac)
# replace '-' between date and time with a 'T' and convert to Time
return Time(mjd_int, frac, scale='utc', format='mjd')
if "MJD_START" in self:
mjd_int, frac = self['MJD_START'].split('.')
mjd_int = int(mjd_int)
frac = float('.' + frac)
return Time(mjd_int, frac, scale='utc', format='mjd')
else:
# replace '-' between date and time with a 'T' and convert to Time
t0 = self['UTC_START']
t0 = t0[:10] + 'T' + t0[11:]
return Time(t0, scale='utc', format='isot')

@start_time.setter
def start_time(self, start_time):
Expand Down
46 changes: 45 additions & 1 deletion baseband/dada/payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ..base.payload import PayloadBase


__all__ = ['DADAPayload']
__all__ = ['DADAPayload', "MKBFPayload"]


def decode_8bit(words):
Expand Down Expand Up @@ -43,3 +43,47 @@ class DADAPayload(PayloadBase):
8: encode_8bit}
_memmap = True
_sample_shape_maker = namedtuple('SampleShape', 'npol, nchan')

def __new__(cls, words, *, header=None, **kwargs):
    """Create a payload instance, dispatching on the header's instrument.

    If a header is given and its INSTRUMENT keyword equals "MKBF",
    instantiate the MKBFPayload subclass instead of the requested class,
    so that the heap-ordered sample layout is handled transparently.
    All other arguments are dealt with in ``__init__``.
    """
    # Override instrument if header is given.
    if header is not None and header.get("INSTRUMENT") == "MKBF":
        cls = MKBFPayload
    return super().__new__(cls)


class MKBFPayload(DADAPayload):
    """Container for decoding and encoding MKBF DADA payloads.

    Subclass of `~baseband.dada.DADAPayload` that takes into account
    that the samples are organized in heaps of 256 samples, with order
    (nheap, npol, nchan, nsub=256).

    Some information on the instrument writing it can be found in
    `Van der Byl et al. 2021 <https://doi.org/10.1117/1.JATIS.8.1.011006>`_

    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Change view of words to indicate one element contains 256
        # samples for each part of the sample_shape (this also ensures
        # bpw=bits-per-word is correct in _item_to_slices).
        # Each word becomes one full heap: a "heaps" field of complex
        # (real, imag) int pairs with subarray shape sample_shape + (256,).
        self._dtype_word = np.dtype(
            [("heaps",
              [("real", f"int{self.bps}"), ("imag", f"int{self.bps}")],
              self.sample_shape + (256,))])
        self.words = self.words.view(self._dtype_word)

    def _decode(self, words, data_slice=()):
        # words["heaps"] has shape (nheap,) + sample_shape + (256,);
        # move the per-heap sample axis next to the heap axis and flatten,
        # yielding time-ordered samples of shape (-1,) + sample_shape,
        # then apply the requested slice before the parent decode.
        words = np.moveaxis(words["heaps"], -1, 1).ravel().reshape(
            -1, *self.sample_shape)[data_slice]
        return super()._decode(words)

    def _encode(self, data):
        # Inverse of _decode: regroup time-ordered samples into heaps of
        # 256 and move the 256-sample axis to the fastest (last) position.
        # Assumes the number of samples is a multiple of 256 — presumably
        # guaranteed by the payload size; confirm against callers.
        data = np.moveaxis(data.reshape(-1, 256, *self.sample_shape), 1, -1)
        return super()._encode(data)

    def __getitem__(self, item=()):
        # Translate the item into a slice of heap-words plus a slice of
        # decoded samples, since one word holds 256 samples.
        words_slice, data_slice = self._item_to_slices(item)
        return self._decode(self.words[words_slice], data_slice)

    data = property(__getitem__, doc="Full decoded payload.")
66 changes: 65 additions & 1 deletion baseband/dada/tests/test_dada.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import pytest
import numpy as np
import astropy.units as u
from numpy.testing import assert_array_equal
from astropy.time import Time
from ... import dada
from ...helpers import sequentialfile as sf
from ..base import DADAFileNameSequencer
from ...data import SAMPLE_DADA as SAMPLE_FILE, SAMPLE_MEERKAT_DADA
from ..payload import decode_8bit
from ...data import (
SAMPLE_DADA as SAMPLE_FILE, SAMPLE_MEERKAT_DADA, SAMPLE_MKBF_DADA)


class TestDADA:
Expand Down Expand Up @@ -810,3 +813,64 @@ def test_meerkat_data():
with dada.open(SAMPLE_MEERKAT_DADA, 'rs') as fh:
data = fh.read()
assert data.shape == (16384 - 4096 // 2, 2)


class TestMKBF:
    """Tests for reading and writing MeerKAT MKBF-flavoured DADA files."""

    def test_header(self):
        # Check that header can be read and time is correct.
        # NOTE(review): the sample lacks MJD_START, so start_time is
        # presumably derived from UTC_START — confirm against header.py.
        with dada.open(SAMPLE_MKBF_DADA, 'rb') as fh:
            header = fh.read_header()
        assert header.sample_shape == (2, 1024)
        assert header["NPOL"] == 2
        assert header["NCHAN"] == 1024
        assert header.start_time == Time("2023-07-19T15:24:04")

    def test_data(self):
        # Stream-read the full payload and verify seeking is consistent.
        with dada.open(SAMPLE_MKBF_DADA, 'rs') as fh:
            data = fh.read()
            fh.seek(10)
            d10 = fh.read(1)
        assert_array_equal(d10, data[10:11])

        # Independently decode the raw payload bytes (after the 4096-byte
        # header) and undo the heap ordering by hand: raw layout is
        # (npol=2, nchan=1024, nsub=256) with samples on the fastest axis.
        with open(SAMPLE_MKBF_DADA, 'rb') as fh:
            fh.seek(4096)
            raw_words = np.frombuffer(fh.read(-1), dtype="u1")

        pd = decode_8bit(raw_words).view("c8").reshape(2, 1024, 256)
        check = np.moveaxis(pd, -1, 0).reshape(data.shape)
        assert_array_equal(check, data)

    @pytest.mark.parametrize("nheap", [1, 3, 6])
    def test_writing(self, nheap, tmpdir):
        # Round-trip: write 6 heaps' worth of data in unevenly sized
        # chunks (including splits inside a heap) with a payload size of
        # ``nheap`` heaps per frame, then read back and compare.
        with dada.open(SAMPLE_MKBF_DADA, 'rs') as fh:
            header = fh.header0
            data = fh.read()
        test_file = str(tmpdir.join('test_mkbf.dada'))
        # swap real and imaginary.
        other_data = data.view("f4")[..., ::-1].copy().view("c8")
        assert not np.all(other_data == data)
        new_header = header.copy()
        new_header.payload_nbytes *= nheap
        with dada.open(test_file, "ws", header0=new_header) as fw:
            fw.write(data)
            fw.write(other_data)
            fw.write(other_data[:200])
            fw.write(data[200:])
            fw.write(np.concatenate([data, other_data, data]))

        with dada.open(test_file, "rs") as fr:
            assert fr.header0 == new_header
            out = fr.read()
            # With nheap=6 the 6 heaps fill the last frame exactly, so its
            # header matches; otherwise the final frame is a partial one.
            if nheap == 6:
                assert fr._last_header == new_header
            else:
                assert fr._last_header != new_header

        # Verify each written chunk landed at the expected sample offsets.
        assert out.shape == (6*256, 2, 1024)
        assert_array_equal(out[:256], data)
        assert_array_equal(out[256:512], other_data)
        assert_array_equal(out[512:712], other_data[:200])
        assert_array_equal(out[712:768], data[200:])
        assert_array_equal(out[768:1024], data)
        assert_array_equal(out[1024:1280], other_data)
        assert_array_equal(out[1280:], data)
7 changes: 7 additions & 0 deletions baseband/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ def _full_path(name, dirname=_path.dirname(_path.abspath(__file__))):
with many ctrl-0.
"""

SAMPLE_MKBF_DADA = _full_path('sample_mkbf.dada')
"""DADA sample in custom Meerkat MKBF format, adapted to shortened size.

The sample does not have the standard "HEADER" keyword, and stores data
in chunks of 256 samples (hardcoded, not inferable from header).
"""

SAMPLE_PUPPI = _full_path('sample_puppi.raw')
"""GUPPI/PUPPI sample, npol=2, nchan=4.

Expand Down
Binary file added baseband/data/sample_mkbf.dada
Binary file not shown.
6 changes: 6 additions & 0 deletions docs/dada/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ single :term:`data frame` consisting of an ASCII :term:`header` of typically
<http://psrdada.sourceforge.net/manuals/Specification.pdf>`_ and
actual usage; files are described by an :ref:`ASCII header <dada_header>`.

.. note::
Since the specification is also by actual usage, some files are not
immediately readable (e.g., in Meerkat's MKBF files, the samples are not
stored consecutively, but in heaps). Please raise an issue if you
encounter problems, ideally with links to format definitions.

.. _dada_usage:

Usage
Expand Down