
Writing ntuple larger than 2GB fails when no compression is used #1130

@grzanka

Description


The problem

Trying to save an ntuple (TTree) with more than 2 GB of data and no compression fails with the following error:

error: 'i' format requires -2147483648 <= number <= 2147483647
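This limit comes from Python's struct module: somewhere in the write path, uproot packs a file location with the 'i' format, a 4-byte signed integer. A minimal standalone illustration of the failure mode (not uproot code):

import struct

# A 4-byte signed int ('i') only holds values up to 2**31 - 1,
# so any file offset past ~2 GiB overflows:
struct.pack(">i", 2**31 - 1)  # fine
struct.pack(">i", 2**31)      # struct.error: 'i' format requires -2147483648 <= number <= 2147483647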

The minimum code to reproduce:

from pathlib import Path
import numpy as np
import uproot
data_dict = {
    "x": np.ones(1_000_000_000, dtype=np.float64),
}
with uproot.recreate(Path('file.root'), compression=None) as fout:
    fout["tree"] = data_dict

Details

More details, with the original code in which the problem arose, are below. In the comments below I've also provided more examples.

I was trying to write a ROOT ntuple with the following code:

import time
from pathlib import Path
import h5py
from hdf import peak_count
import uproot

def convert_hdf_to_ntuple(input_path: Path):
    ntuple_path = input_path.with_suffix('.root')
    print(f"Saving ntuple to {ntuple_path}")
    before_write = time.time()
    ntuple_path.unlink(missing_ok=True)

    uproot.create(ntuple_path, compression=None)

    file = uproot.reading.ReadOnlyFile(ntuple_path)
    print(f"file 64 bit (check via file.is_64bit) {file.is_64bit}")
    
    for channel_no in range(4):
        with h5py.File(input_path, 'r') as f, uproot.update(ntuple_path) as fout:
            print(f"Processing channel {channel_no}")
            gain_mV = f[f'channel_{channel_no}'].attrs['gain_mV']
            offset_mV = f[f'channel_{channel_no}'].attrs['offset_mV']
            horiz_interval_ns = f[f'channel_{channel_no}'].attrs['horiz_interval_ns']
            fout[f'channel_{channel_no}/gain_mV'] = str(gain_mV)
            fout[f'channel_{channel_no}/offset_mV'] = str(offset_mV)
            fout[f'channel_{channel_no}/horiz_interval_ns'] = str(horiz_interval_ns)

            peaks_in_bucket = 10000000
            for peak_type in ['positive', 'negative']:
                print(f"Processing {peak_type} peaks")
                total_number_of_peaks = peak_count(f, channel_no, peak_type)
                for i in range(0, total_number_of_peaks, peaks_in_bucket):
                    dict_bucket = {}
                    for name, dataset in f[f'channel_{channel_no}/{peak_type}'].items():
                        dict_bucket[name] = dataset[i:i + peaks_in_bucket]
                    dict_bucket['peak_value_mV'] = dict_bucket['peak_value'] * gain_mV
                    dict_bucket['peak_length_ns'] = dict_bucket['peak_length'] * horiz_interval_ns
                    dict_bucket['peak_start_us'] = dict_bucket['peak_start'] * horiz_interval_ns / 1000
                    dict_bucket['peak_cfd_us'] = dict_bucket['peak_cfd_index'] * horiz_interval_ns / 1000
                    dict_bucket['peak_rise_ns'] = dict_bucket['rise_time'] * horiz_interval_ns
                    dict_bucket['peak_area_ns_mV'] = dict_bucket['peak_area'] * horiz_interval_ns * gain_mV
                    dict_bucket['peak_baseline_mV'] = dict_bucket['peak_baseline'] * gain_mV - offset_mV
                    dict_bucket['peak_noise_mV'] = dict_bucket['peak_noise'] * gain_mV
                    dict_bucket['peak_fwhm_ns'] = dict_bucket['peak_fwhm'] * horiz_interval_ns

                    del dict_bucket['peak_value']
                    del dict_bucket['peak_length']
                    del dict_bucket['peak_area']
                    del dict_bucket['peak_cfd_index']
                    del dict_bucket['rise_time']
                    del dict_bucket['peak_baseline']
                    del dict_bucket['peak_noise']
                    del dict_bucket['peak_fwhm']
                    if i == 0:
                        fout[f'channel_{channel_no}/{peak_type}'] = dict_bucket
                    else:
                        fout[f'channel_{channel_no}/{peak_type}'].extend(dict_bucket)
                    print(f"num entries {fout[f'channel_{channel_no}/{peak_type}'].num_entries} , num baskets {fout[f'channel_{channel_no}/{peak_type}'].num_baskets}")

    after_write = time.time()
    print(f"Writing took {after_write - before_write:.3f} s")

This works nicely as long as the files are small, say under 2 GB.

When trying to save a larger file, I get the following error:

Traceback (most recent call last):
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/convert_from_lv1_to_lv2.py", line 146, in <module>
    main()
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/convert_from_lv1_to_lv2.py", line 134, in main
    convert_hdf_to_ntuple(input_path)
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/root.py", line 58, in convert_hdf_to_ntuple
    fout[f'channel_{channel_no}/{peak_type}'] = dict_bucket
    ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 984, in __setitem__
    self.update({where: what})
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 1555, in update
    uproot.writing.identify.add_to_directory(v, name, directory, streamers)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/identify.py", line 152, in add_to_directory
    tree.extend(data)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 1834, in extend
    self._cascading.extend(self._file, self._file.sink, data)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascadetree.py", line 816, in extend
    totbytes, zipbytes, location = self.write_np_basket(
                                   ^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascadetree.py", line 1427, in write_np_basket
    self._freesegments.write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 782, in write
    super().write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 132, in write
    dependency.write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 102, in write
    tmp = self.serialize()
          ^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 377, in serialize
    format.pack(
struct.error: 'i' format requires -2147483648 <= number <= 2147483647

I saw a similar error reported long ago here: scikit-hep/uproot3#462

Also, looking at the source code of the extend method in class NTuple(CascadeNode), it seems that all calls to add_rblob use the big=False argument, which suggests that only 4-byte pointers are being used.

See:

page_key = self.add_rblob(sink, data_bytes, len(data_bytes), big=False)
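For context, ROOT's on-disk format has a "small" and a "big" variant of these pointers: seek fields below 2**31 are stored as 4-byte signed ints, while keys past the 2 GB mark add 1000 to the key version and store 8-byte seeks. A hedged sketch of that convention (my illustration, not uproot's actual code):

import struct

def pack_seek(location: int, big: bool) -> bytes:
    # big=True: 8-byte signed ('q'), used for keys past the 2 GB mark.
    # big=False: 4-byte signed ('i'), overflows for locations >= 2**31.
    if big:
        return struct.pack(">q", location)
    return struct.pack(">i", location)

So if big=False is hard-coded, every pointer is confined to the small form.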

This is my uproot version:

Python 3.11.3 (main, Nov 19 2023, 23:25:18) [GCC 12.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import uproot
>>> uproot.__version__
'5.2.2'
