
IO

datamate.io

Module for handling file I/O operations for Directory objects.

This module provides functionality for reading, writing, and manipulating HDF5 files and directories.

ArrayFile

Bases: Protocol

Protocol for single-array HDF5 file interface.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `path` | `Path` | Path to the HDF5 file. |
| `shape` | `Tuple[int, ...]` | Shape of the array data. |
| `dtype` | `dtype` | NumPy dtype of the array data. |

Source code in datamate/io.py
@runtime_checkable
class ArrayFile(Protocol):
    """Protocol for single-array HDF5 file interface.

    Attributes:
        path: Path to the HDF5 file.
        shape: Shape of the array data.
        dtype: NumPy dtype of the array data.
    """

    path: Path
    shape: Tuple[int, ...]
    dtype: np.dtype

    def __getitem__(self, key: Any) -> Any: ...
    def __len__(self) -> int: ...
    def __getattr__(self, key: str) -> Any: ...
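
Because `ArrayFile` is a `runtime_checkable` `Protocol`, it can serve both as a static type annotation and as the target of `isinstance` checks. A minimal usage sketch, where the path is a hypothetical single-array HDF5 file previously written by datamate in SWMR mode:

```python
import numpy as np

from datamate.io import ArrayFile, H5Reader


def summarize(array: ArrayFile) -> str:
    # Accept anything that satisfies the ArrayFile protocol.
    return f"{array.path.name}: shape={array.shape}, dtype={array.dtype}"


reader = H5Reader("runs/run_01/responses.h5")  # hypothetical path
assert isinstance(reader, ArrayFile)  # structural check enabled by @runtime_checkable
print(summarize(reader))
print(np.asarray(reader[:5]))  # slicing is forwarded to the underlying dataset
```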

H5Reader

Bases: ArrayFile

Wrapper around HDF5 read operations with safe file handle management.

Ensures file handles are only open during access operations to prevent resource leaks.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `path` | `Path` | Path to the HDF5 file. |
| `shape` | `Tuple[int, ...]` | Shape of the array data. |
| `dtype` | `dtype` | NumPy dtype of the array data. |
| `n_retries` | `int` | Number of retry attempts for file operations. |

Source code in datamate/io.py
class H5Reader(ArrayFile):
    """Wrapper around HDF5 read operations with safe file handle management.

    Ensures file handles are only open during access operations to prevent resource leaks.

    Attributes:
        path: Path to the HDF5 file.
        shape: Shape of the array data.
        dtype: NumPy dtype of the array data.
        n_retries: Number of retry attempts for file operations.
    """

    def __init__(
        self, path: Path, assert_swmr: bool = True, n_retries: int = 10
    ) -> None:
        self.path = Path(path)
        with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
            if assert_swmr:
                assert f.swmr_mode, "File is not in SWMR mode."
            assert "data" in f
            self.shape = f["data"].shape
            self.dtype = f["data"].dtype
        self.n_retries = n_retries

    def __getitem__(self, key):
        for retry_count in range(self.n_retries):
            try:
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    data = f["data"][key]
                break
            except Exception as e:
                if retry_count == self.n_retries - 1:
                    raise e
                sleep(0.1)
        return data

    def __len__(self):
        return self.shape[0]

    def __getattr__(self, key):
        # get attribute from underlying h5.Dataset object
        for retry_count in range(self.n_retries):
            try:
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    value = getattr(f["data"], key, None)
                break
            except Exception as e:
                if retry_count == self.n_retries - 1:
                    raise e
                sleep(0.1)
        if value is None:
            raise AttributeError(f"Attribute {key} not found.")
        # wrap callable attributes to open file before calling function
        if callable(value):

            def safe_wrapper(*args, **kwargs):
                # not trying `n_retries` times here, just for simplicity
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    output = getattr(f["data"], key)(*args, **kwargs)
                return output

            return safe_wrapper
        # otherwise just return value
        else:
            return value
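
In practice, `H5Reader` objects are usually obtained through attribute access on a `Directory` rather than constructed by hand. A minimal sketch of direct use, assuming the path below is a hypothetical datamate-written HDF5 file containing a `"data"` dataset in SWMR mode:

```python
from datamate.io import H5Reader

reader = H5Reader("runs/run_01/responses.h5", assert_swmr=True, n_retries=10)

print(len(reader), reader.shape, reader.dtype)

# Every access opens the file, reads, and closes it again, so no handle stays
# open between calls; transient read failures are retried up to n_retries times.
first_rows = reader[:10]   # slice of the "data" dataset as a NumPy array
full_array = reader[...]   # load the whole dataset into memory
n_dims = reader.ndim       # scalar h5py.Dataset attributes are forwarded too
```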

directory_to_df

directory_to_df(directory, dtypes=None)

Convert a directory to a pandas DataFrame.

Creates a DataFrame from HDF5 datasets in the directory. Single-element datasets are broadcast to match the most common length.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `directory` | `Directory` | Directory object containing HDF5 datasets. | *required* |
| `dtypes` | `Optional[Dict[str, dtype]]` | Optional mapping of column names to NumPy dtypes. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame containing the directory data. |

Example:

```python
dir = Directory("path/to/dir")
df = directory_to_df(dir, {"col1": np.float32})
```

Source code in datamate/io.py
def directory_to_df(
    directory: "Directory", dtypes: Optional[Dict[str, np.dtype]] = None
) -> DataFrame:
    """Convert a directory to a pandas DataFrame.

    Creates a DataFrame from HDF5 datasets in the directory. Single-element datasets
    are broadcast to match the most common length.

    Args:
        directory: Directory object containing HDF5 datasets.
        dtypes: Optional mapping of column names to NumPy dtypes.

    Returns:
        DataFrame containing the directory data.

    Example:
        ```python
        dir = Directory("path/to/dir")
        df = directory_to_df(dir, {"col1": np.float32})
        ```
    """
    from .utils import byte_to_str

    df_dict = {
        key: getattr(directory, key)[...]
        for key in list(directory.keys())
        if isinstance(getattr(directory, key), H5Reader)
    }

    # Get the lengths of all datasets.
    nelements = {k: len(v) or 1 for k, v in df_dict.items()}

    lengths, counts = np.unique([val for val in nelements.values()], return_counts=True)
    most_frequent_length = lengths[np.argmax(counts)]

    # If there are single element datasets, just create a new column of most_frequent_length and put the value in each row.
    if lengths.min() == 1:
        for k, v in nelements.items():
            if v == 1:
                df_dict[k] = df_dict[k].repeat(most_frequent_length)

    df_dict = byte_to_str(df_dict)

    if dtypes is not None:
        df_dict = {
            k: np.array(v).astype(dtypes[k]) for k, v in df_dict.items() if k in dtypes
        }
    return DataFrame.from_dict(
        {k: v.tolist() if v.ndim > 1 else v for k, v in df_dict.items()}
    )
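
The broadcasting rule can be seen with a small hypothetical directory: single-element datasets are repeated to the most common dataset length so that every column ends up with the same number of rows. A sketch, assuming the standard datamate pattern of writing arrays by attribute assignment (path and column names are placeholders):

```python
import numpy as np

from datamate import Directory
from datamate.io import directory_to_df

exp = Directory("runs/run_01")          # hypothetical directory on disk
exp.responses = np.random.rand(100)     # length-100 dataset
exp.labels = np.arange(100)             # length-100 dataset
exp.temperature = np.array([36.5])      # single-element dataset

df = directory_to_df(exp)
# "temperature" is repeated to the most common length (100), so all three
# columns have 100 rows.
print(df.shape)  # (100, 3)
```

Note that when `dtypes` is given, only the columns listed in `dtypes` are kept in the resulting DataFrame, as the comprehension in the source above shows.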