Description
The problem
Trying to save an ntuple (TTree) with more than 2 GB of data and no compression fails with the following error:
error: 'i' format requires -2147483648 <= number <= 2147483647
The minimal code to reproduce:
from pathlib import Path
import numpy as np
import uproot
data_dict = {
    "x": np.ones(1_000_000_000, dtype=np.float64),
}
with uproot.recreate(Path('file.root'), compression=None) as fout:
    fout["tree"] = data_dict
Details
More details follow, together with the original code in which the problem occurred. In the comments below I've also provided more examples.
I was trying to write a ROOT ntuple with the following code:
import time
from pathlib import Path
import h5py
from hdf import peak_count
import uproot
def convert_hdf_to_ntuple(input_path: Path):
    ntuple_path = input_path.with_suffix('.root')
    print(f"Saving ntuple to {ntuple_path}")
    before_write = time.time()
    ntuple_path.unlink(missing_ok=True)
    uproot.create(ntuple_path, compression=None)
    file = uproot.reading.ReadOnlyFile(ntuple_path)
    print(f"file 64 bit (check via file.is_64bit) {file.is_64bit}")
    for channel_no in range(4):
        with h5py.File(input_path, 'r') as f, uproot.update(ntuple_path) as fout:
            print(f"Processing channel {channel_no}")
            gain_mV = f[f'channel_{channel_no}'].attrs['gain_mV']
            offset_mV = f[f'channel_{channel_no}'].attrs['offset_mV']
            horiz_interval_ns = f[f'channel_{channel_no}'].attrs['horiz_interval_ns']
            fout[f'channel_{channel_no}/gain_mV'] = str(gain_mV)
            fout[f'channel_{channel_no}/offset_mV'] = str(offset_mV)
            fout[f'channel_{channel_no}/horiz_interval_ns'] = str(horiz_interval_ns)
            peaks_in_bucket = 10_000_000
            for peak_type in ['positive', 'negative']:
                print(f"Processing {peak_type} peaks")
                total_number_of_peaks = peak_count(f, channel_no, peak_type)
                for i in range(0, total_number_of_peaks, peaks_in_bucket):
                    dict_bucket = {}
                    for name, dataset in f[f'channel_{channel_no}/{peak_type}'].items():
                        dict_bucket[name] = dataset[i:i + peaks_in_bucket]
                    dict_bucket['peak_value_mV'] = dict_bucket['peak_value'] * gain_mV
                    dict_bucket['peak_length_ns'] = dict_bucket['peak_length'] * horiz_interval_ns
                    dict_bucket['peak_start_us'] = dict_bucket['peak_start'] * horiz_interval_ns / 1000
                    dict_bucket['peak_cfd_us'] = dict_bucket['peak_cfd_index'] * horiz_interval_ns / 1000
                    dict_bucket['peak_rise_ns'] = dict_bucket['rise_time'] * horiz_interval_ns
                    dict_bucket['peak_area_ns_mV'] = dict_bucket['peak_area'] * horiz_interval_ns * gain_mV
                    dict_bucket['peak_baseline_mV'] = dict_bucket['peak_baseline'] * gain_mV - offset_mV
                    dict_bucket['peak_noise_mV'] = dict_bucket['peak_noise'] * gain_mV
                    dict_bucket['peak_fwhm_ns'] = dict_bucket['peak_fwhm'] * horiz_interval_ns
                    del dict_bucket['peak_value']
                    del dict_bucket['peak_length']
                    del dict_bucket['peak_area']
                    del dict_bucket['peak_cfd_index']
                    del dict_bucket['rise_time']
                    del dict_bucket['peak_baseline']
                    del dict_bucket['peak_noise']
                    del dict_bucket['peak_fwhm']
                    if i == 0:
                        fout[f'channel_{channel_no}/{peak_type}'] = dict_bucket
                    else:
                        fout[f'channel_{channel_no}/{peak_type}'].extend(dict_bucket)
                    print(f"num entries {fout[f'channel_{channel_no}/{peak_type}'].num_entries} , num baskets {fout[f'channel_{channel_no}/{peak_type}'].num_baskets}")
    after_write = time.time()
    print(f"Writing took {after_write - before_write:.3f} s")
This works nicely as long as the files are small, say smaller than 2 GB.
When trying to save a larger file I get the following error:
Traceback (most recent call last):
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/convert_from_lv1_to_lv2.py", line 146, in <module>
    main()
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/convert_from_lv1_to_lv2.py", line 134, in main
    convert_hdf_to_ntuple(input_path)
  File "/net/people/plgrid/plgkongruencj/2022-krakow-lgad/src/root.py", line 58, in convert_hdf_to_ntuple
    fout[f'channel_{channel_no}/{peak_type}'] = dict_bucket
    ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 984, in __setitem__
    self.update({where: what})
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 1555, in update
    uproot.writing.identify.add_to_directory(v, name, directory, streamers)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/identify.py", line 152, in add_to_directory
    tree.extend(data)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/writable.py", line 1834, in extend
    self._cascading.extend(self._file, self._file.sink, data)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascadetree.py", line 816, in extend
    totbytes, zipbytes, location = self.write_np_basket(
                                   ^^^^^^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascadetree.py", line 1427, in write_np_basket
    self._freesegments.write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 782, in write
    super().write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 132, in write
    dependency.write(sink)
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 102, in write
    tmp = self.serialize()
          ^^^^^^^^^^^^^^^^
  File "/memfs/7649613/poetry_cache/virtualenvs/2022-krakow-lgad-_qGHPVZk-py3.11/lib/python3.11/site-packages/uproot/writing/_cascade.py", line 377, in serialize
    format.pack(
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
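The message itself comes from Python's struct module: somewhere a file location is being packed into a signed 32-bit field ('i'), which cannot represent offsets of 2 GiB or more. The same error can be reproduced without uproot:

import struct

struct.pack(">i", 2**31 - 1)  # fine: 2147483647 is the largest value that fits
struct.pack(">i", 2**31)      # struct.error: 'i' format requires -2147483648 <= number <= 2147483647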
I saw a similar error reported long ago here: scikit-hep/uproot3#462
Also, when looking at the source code of the extend method in class NTuple(CascadeNode), it seems that all calls to add_rblob use the big=False argument, which suggests that only 4-byte pointers are being used.
See uproot5/src/uproot/writing/_cascadentuple.py, line 779 at commit 8a42e7d:

page_key = self.add_rblob(sink, data_bytes, len(data_bytes), big=False)
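A hypothetical sketch of the kind of change this suggests, not uproot's actual code: choose the pointer width from the current write position instead of hard-coding big=False (the variable holding the write position and the exact threshold are assumptions; ROOT switches to 64-bit offsets near the 2 GB mark):

# Hypothetical, for illustration only: 'location' is assumed to be the
# current write offset; the real selection logic in uproot may differ.
use_big = location + len(data_bytes) > 2147483647  # would overflow a signed 32-bit int
page_key = self.add_rblob(sink, data_bytes, len(data_bytes), big=use_big)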
This is my uproot version:
Python 3.11.3 (main, Nov 19 2023, 23:25:18) [GCC 12.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import uproot
>>> uproot.__version__
'5.2.2'