Skip to content

Commit

Permalink
Merge pull request #6 from stegro/master
Browse files Browse the repository at this point in the history
PR for new function h5delete and autochunk feature
  • Loading branch information
tmullins committed Nov 16, 2015
2 parents ae141b6 + 3488b7b commit 364310f
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 33 deletions.
18 changes: 5 additions & 13 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ hdf5oct - a HDF5 wrapper for GNU Octave

Copyright 2012 Tom Mullins
Copyright 2015 Tom Mullins, Anton Starikov, Thorsten Liebig, Stefan Großhauser
Copyright 2008-2013 Andrew Collette

This is a library for GNU Octave for reading hdf5 files. At the moment it
provides the following functions:
Expand All @@ -27,6 +28,8 @@ provides the following functions:
h5create: Create a dataset and specify its extent dimensions,
datatype and chunk size.

h5delete: Delete a group, dataset, or attribute.

Note that only a few of the HDF5 datatypes are supported by each of
hdf5oct's functions at the moment, typically one or several of double,
integer and string.
Expand Down Expand Up @@ -63,22 +66,11 @@ To uninstall the package you may want to use

- write h5info, h5disp

- support compression flags
- support compression flags for h5create

- read string typed datasets

- read string-array typed attributes

- maybe use hdf5's c++ interface instead of the c interface.

- write more comprehensive tests instead of a few random choices. Also
test for error conditions.

- make sure Fortran/C ordering is handled in a matlab compatible manner

For integration into octave core:

- make it really clean

- guess chunksize like
https://github.com/h5py/h5py/blob/master/h5py/_hl/filters.py#L257
test for error conditions.
225 changes: 207 additions & 18 deletions h5read.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/*
*
* Copyright (C) 2012 Tom Mullins
* Copyright (C) 2015 Tom Mullins, Thorsten Liebig, Stefan Großhauser
*
* Copyright (C) 2015 Tom Mullins, Thorsten Liebig, Anton Starikov, Stefan Großhauser
* Copyright (C) 2008-2013 Andrew Collette
*
* This file is part of hdf5oct.
*
Expand Down Expand Up @@ -420,6 +420,8 @@ The vector @var{size} may contain one or several Inf (or \n\
equivalently: zero) values.\n\
This will lead to unlimited maximum extent of the dataset in the\n\
respective dimensions and 0 initial extent.\n\
Note that any dataset with at least one unlimited dimension must be chunked and\n\
it is generally recommended for large datasets.\n\
\n\
The list of @var{key}, @var{val} arguments allows to specify\n\
certain properties of the dataset. Allowed settings are:\n\
Expand All @@ -429,9 +431,11 @@ certain properties of the dataset. Allowed settings are:\n\
one of the strings @samp{double} @samp{single} @samp{uint64} @samp{uint32} @samp{uint16} @samp{uint8} @samp{int64} @samp{int32} @samp{int16} @samp{int8} \n\
\n\
@item @option{ChunkSize}\n\
a vector specifying the chunk size. Note that any\n\
dataset with an unlimited dimension must be chunked.\n\
The default value is an empty vector [], which means no chunking.\n\
The value may be either a vector specifying the chunk size,\n\
or an empty vector [], which means no chunking (this is the default),\n\
or the string @samp{auto} which makes the library choose automatically \n\
an appropriate chunk size, as best as it can. Note that the @samp{auto}\n\
setting is not @sc{matlab} compatible.\n\
@end table\n\
\n\
@seealso{h5write}\n\
Expand Down Expand Up @@ -485,7 +489,18 @@ The default value is an empty vector [], which means no chunking.\n\
}
else if (args(i).string_value () == "ChunkSize")
{
if (! check_vec (args(i+1), chunksize, "ChunkSize", false))
if (args(i+1).is_string ())
{
if(args(i+1).string_value () != "auto")
{
error ("ChunkSize argument must be either a vector, or the string 'auto'.");
return octave_value_list ();
}
chunksize = args(2).matrix_value ();
chunksize(0) = 0;

}
else if (! check_vec (args(i+1), chunksize, "ChunkSize", false))
return octave_value_list ();
}
else
Expand All @@ -507,6 +522,68 @@ The default value is an empty vector [], which means no chunking.\n\
#endif
}


DEFUN_DLD (h5delete, args, nargout,
"-*- texinfo -*-\n\
@deftypefn {Loadable Function} h5delete (@var{filename}, @var{objname})\n\
@deftypefnx {Loadable Function} h5delete (@var{filename}, @var{objname}, @var{attname})\n\
\n\
In the first form, delete a dataset or group with name @var{objname}\n\
in the HDF5 file specified by @var{filename}.\n\
\n\
In the second form, delete an attribute with name @var{attname} associated\n\
to a dataset or group with name @var{objname}\n\
in the HDF5 file specified by @var{filename}.\n\
\n\
Note that this function is not @sc{matlab} compliant.\n\
\n\
@seealso{h5create}\n\
@end deftypefn")
{
#if ! (defined (HAVE_HDF5) && defined (HAVE_HDF5_18))
  gripe_disabled_feature("h5delete", "HDF5 IO");
  return octave_value_list ();
#else
  int nargin = args.length ();

  // h5delete produces no output; accept exactly 2 or 3 input arguments.
  if (! (nargin == 2 || nargin == 3) || nargout != 0)
    {
      print_usage ();
      return octave_value_list ();
    }
  // The file name and the object name must both be strings.
  if (! (args(0).is_string () && args(1).is_string ()))
    {
      print_usage ();
      return octave_value_list ();
    }
  // The optional attribute name must be a string as well.
  if (nargin == 3 && ! args(2).is_string ())
    {
      print_usage ();
      return octave_value_list ();
    }

  string filename = args(0).string_value ();
  string location = args(1).string_value ();
  if (error_state)
    return octave_value_list ();

  // Open the hdf5 file.
  // NOTE(review): the second constructor argument is create_if_nonexisting;
  // passing true here means deleting from a nonexistent file silently
  // creates an empty one — confirm this is intended.
  H5File file (filename.c_str (), true);
  if (error_state)
    return octave_value_list ();
  if (nargin == 2)
    // Two-argument form: delete the group or dataset itself.
    file.delete_link (location.c_str ());
  else if (nargin == 3)
    {
      // Three-argument form: delete an attribute of the group/dataset.
      string attname = args(2).string_value ();
      if (! error_state)
        file.delete_att (location.c_str (), attname.c_str ());
    }

  return octave_value_list ();
#endif
}

#if defined (HAVE_HDF5) && defined (HAVE_HDF5_18)

H5File::H5File (const char *filename, const bool create_if_nonexisting)
Expand Down Expand Up @@ -788,7 +865,7 @@ H5File::read_dset ()
size_t rdcc_nbytes = -1; \
double rdcc_w0 = -1; \
if (H5Pget_cache (H5Fget_access_plist (file), &mdc_nelem, \
&rdcc_nelem, &rdcc_nbytes, &rdcc_w0 ) < 0) \
&rdcc_nelem, &rdcc_nbytes, &rdcc_w0 ) < 0) \
{ \
error ("could not determine raw data chunk cache parameters."); \
return octave_value_list (); \
Expand Down Expand Up @@ -1385,28 +1462,59 @@ You have to save real and imag part separately.");

void
H5File::create_dset (const char *location, const Matrix& size,
const char *datatype, const Matrix& chunksize)
const char *datatype, Matrix& chunksize)
{
int typesize;
if (strcmp (datatype,"double") == 0)
type_id = H5Tcopy (H5T_NATIVE_DOUBLE);
{
type_id = H5Tcopy (H5T_NATIVE_DOUBLE);
typesize = sizeof(double);
}
else if (strcmp (datatype,"single") == 0)
type_id = H5Tcopy (H5T_NATIVE_FLOAT);
{
type_id = H5Tcopy (H5T_NATIVE_FLOAT);
typesize = sizeof(float);
}
else if (strcmp (datatype,"uint64") == 0)
type_id = H5Tcopy (H5T_STD_U64LE);
{
type_id = H5Tcopy (H5T_STD_U64LE);
typesize = 64/8;
}
else if (strcmp (datatype,"uint32") == 0)
type_id = H5Tcopy (H5T_STD_U32LE);
{
type_id = H5Tcopy (H5T_STD_U32LE);
typesize = 32/8;
}
else if (strcmp (datatype,"uint16") == 0)
type_id = H5Tcopy (H5T_STD_U16LE);
{
type_id = H5Tcopy (H5T_STD_U16LE);
typesize = 16/8;
}
else if (strcmp (datatype,"uint8") == 0)
type_id = H5Tcopy (H5T_STD_U8LE);
{
type_id = H5Tcopy (H5T_STD_U8LE);
typesize = 8/8;
}
else if (strcmp (datatype,"int64") == 0)
type_id = H5Tcopy (H5T_STD_I64LE);
{
type_id = H5Tcopy (H5T_STD_I64LE);
typesize = 64/8;
}
else if (strcmp (datatype,"int32") == 0)
type_id = H5Tcopy (H5T_STD_I32LE);
{
type_id = H5Tcopy (H5T_STD_I32LE);
typesize = 32/8;
}
else if (strcmp (datatype,"int16") == 0)
type_id = H5Tcopy (H5T_STD_I16LE);
{
type_id = H5Tcopy (H5T_STD_I16LE);
typesize = 16/8;
}
else if (strcmp (datatype,"int8") == 0)
type_id = H5Tcopy (H5T_STD_I8LE);
{
type_id = H5Tcopy (H5T_STD_I8LE);
typesize = 8/8;
}
else
{
error ("invalid datatype %s for dataset %s",datatype,location);
Expand All @@ -1431,6 +1539,9 @@ H5File::create_dset (const char *location, const Matrix& size,
if (! chunksize.is_empty ())
{
// a dataset with an unlimited dimension must be chunked.
if (chunksize(0) == 0)
chunksize = get_auto_chunksize(size, typesize);

hsize_t *dims_chunk = alloc_hsize (chunksize, ALLOC_HSIZE_DEFAULT, true);
if (H5Pset_layout (crp_list, H5D_CHUNKED) < 0)
{
Expand All @@ -1456,4 +1567,82 @@ H5File::create_dset (const char *location, const Matrix& size,

}

void
H5File::delete_link (const char *location)
{
herr_t status = H5Ldelete (file, location, H5P_DEFAULT);
if (status < 0)
{
error ("Error when deleting object %s", location);
return;
}
}


void
H5File::delete_att (const char *location, const char *att_name)
{
herr_t status = H5Adelete_by_name (file,location,att_name,H5P_DEFAULT);
if (status < 0)
{
error ("Error when deleting attribute %s of object %s", att_name, location);
return;
}
}


Matrix
H5File::get_auto_chunksize(const Matrix& dset_shape, int typesize)
{
  // This function originally stems from the h5py project.

  // Guess an appropriate chunk layout for a dataset, given its shape and
  // the size of each element in bytes. Will allocate chunks only as large
  // as CHUNK_MAX. Chunks are generally close to some power-of-2 fraction of
  // each axis, slightly favoring bigger values for the last index.
  const int CHUNK_BASE = 16*1024; // Multiplier by which chunks are adjusted
  const int CHUNK_MIN = 8*1024;   // Soft lower limit (8k)
  const int CHUNK_MAX = 1024*1024; // Hard upper limit (1M)

  Matrix chunksize = dset_shape;
  int ndims = chunksize.length ();
  for (int i = 0; i < ndims; i++)
    {
      // For unlimited dimensions we have to guess 1024
      if (chunksize(i) == octave_Inf || chunksize(i) == 0)
        chunksize(i) = 1024;
    }
  // Determine the optimal chunk size in bytes using a PyTables expression.
  // Kept in floating point: an int would overflow for datasets larger
  // than 2 GiB and would break the relative-distance test below.
  double dset_size = chunksize.prod ()(0)*typesize;
  double target_size = CHUNK_BASE * pow(2,log10(dset_size/(1024.0 * 1024)));
  if (target_size > CHUNK_MAX)
    target_size = CHUNK_MAX;
  else if (target_size < CHUNK_MIN)
    target_size = CHUNK_MIN;

  int idx = 0;
  while(true)
    {

      // Repeatedly loop over the axes, dividing them by 2. Stop when:
      // 1a. We're smaller than the target chunk size, OR
      // 1b. We're within 50% of the target chunk size, AND
      // 2. The chunk is smaller than the maximum chunk size
      double chunk_bytes = chunksize.prod ()(0)*typesize;
      // The 50% test must use floating-point division: with integer
      // operands, abs(diff)/target truncates to 0 whenever the difference
      // is below target_size, silently accepting any chunk smaller than
      // twice the target.
      if ((chunk_bytes < target_size ||
           fabs(chunk_bytes-target_size)/target_size < 0.5) &&
          chunk_bytes < CHUNK_MAX)
        break;

      if (chunksize.prod ()(0) == 1)
        break; // Element size larger than CHUNK_MAX

      chunksize(idx%ndims) = ceil(chunksize(idx%ndims) / 2.0);
      idx++;
    }
  return chunksize;
}


#endif
6 changes: 5 additions & 1 deletion h5read.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ class H5File
void write_att (const char *location, const char *attname,
const octave_value& attvalue);
void create_dset (const char *location, const Matrix& size,
const char *datatype, const Matrix& chunksize);
const char *datatype, Matrix& chunksize);
void delete_link (const char *location);
void delete_att (const char *location, const char *att_name);

private:
const static int ALLOC_HSIZE_INFZERO_TO_UNLIMITED = 1;
const static int ALLOC_HSIZE_INF_TO_ZERO = 2;
Expand All @@ -86,6 +89,7 @@ class H5File

int open_dset (const char *dsetname);
octave_value read_dset ();
Matrix get_auto_chunksize (const Matrix& size, int typesize);

template <typename T> hsize_t* alloc_hsize (const T& dim, const int mode, const bool reverse);

Expand Down
1 change: 1 addition & 0 deletions package/PKG_ADD
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ autoload("h5readatt","h5read.oct")
autoload("h5write","h5read.oct")
autoload("h5writeatt","h5read.oct")
autoload("h5create","h5read.oct")
autoload("h5delete","h5read.oct")
2 changes: 1 addition & 1 deletion test/h5test.m
Original file line number Diff line number Diff line change
Expand Up @@ -260,4 +260,4 @@ function check_att(location, att)
disp(["error catched: ", lasterror.message])
end


h5create("test.h5","/created_autchunk",[ Inf Inf 4], 'ChunkSize', 'auto')

0 comments on commit 364310f

Please sign in to comment.