
IO

datamate.io

Module for handling file I/O operations for Directory objects.

This module provides functionality for reading, writing, and manipulating HDF5 files and directories.

ArrayFile

Bases: Protocol

Protocol for single-array HDF5 file interface.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `path` | `Path` | Path to the HDF5 file. |
| `shape` | `Tuple[int, ...]` | Shape of the array data. |
| `dtype` | `dtype` | NumPy dtype of the array data. |

Source code in datamate/io.py
@runtime_checkable
class ArrayFile(Protocol):
    """Protocol for single-array HDF5 file interface.

    Attributes:
        path: Path to the HDF5 file.
        shape: Shape of the array data.
        dtype: NumPy dtype of the array data.
    """

    path: Path
    shape: Tuple[int, ...]
    dtype: np.dtype

    def __getitem__(self, key: Any) -> Any: ...
    def __len__(self) -> int: ...
    def __getattr__(self, key: str) -> Any: ...
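
Because `ArrayFile` is a `runtime_checkable` `Protocol`, it can serve both as a static type annotation and as the target of `isinstance` checks. A minimal usage sketch, where the path is a hypothetical single-array HDF5 file previously written by datamate in SWMR mode:

```python
import numpy as np

from datamate.io import ArrayFile, H5Reader


def summarize(array: ArrayFile) -> str:
    # Accept anything that satisfies the ArrayFile protocol.
    return f"{array.path.name}: shape={array.shape}, dtype={array.dtype}"


reader = H5Reader("runs/run_01/responses.h5")  # hypothetical path
assert isinstance(reader, ArrayFile)  # structural check enabled by @runtime_checkable
print(summarize(reader))
print(np.asarray(reader[:5]))  # slicing is forwarded to the underlying dataset
```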

H5Reader

Bases: ArrayFile

Wrapper around HDF5 read operations with safe file handle management.

Ensures file handles are only open during access operations to prevent resource leaks.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `path` | `Path` | Path to the HDF5 file. |
| `shape` | `Tuple[int, ...]` | Shape of the array data. |
| `dtype` | `dtype` | NumPy dtype of the array data. |
| `n_retries` | `int` | Number of retry attempts for file operations. |

Source code in datamate/io.py
class H5Reader(ArrayFile):
    """Wrapper around HDF5 read operations with safe file handle management.

    Ensures file handles are only open during access operations to prevent resource leaks.

    Attributes:
        path: Path to the HDF5 file.
        shape: Shape of the array data.
        dtype: NumPy dtype of the array data.
        n_retries: Number of retry attempts for file operations.
    """

    def __init__(
        self, path: Path, assert_swmr: bool = True, n_retries: int = 10
    ) -> None:
        self.path = Path(path)
        with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
            if assert_swmr:
                assert f.swmr_mode, "File is not in SWMR mode."
            assert "data" in f
            self.shape = f["data"].shape
            self.dtype = f["data"].dtype
        self.n_retries = n_retries

    def __getitem__(self, key):
        for retry_count in range(self.n_retries):
            try:
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    data = f["data"][key]
                break
            except Exception as e:
                if retry_count == self.n_retries - 1:
                    raise e
                sleep(0.1)
        return data

    def __len__(self):
        return self.shape[0]

    def __getattr__(self, key):
        # get attribute from underlying h5.Dataset object
        for retry_count in range(self.n_retries):
            try:
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    value = getattr(f["data"], key, None)
                break
            except Exception as e:
                if retry_count == self.n_retries - 1:
                    raise e
                sleep(0.1)
        if value is None:
            raise AttributeError(f"Attribute {key} not found.")
        # wrap callable attributes to open file before calling function
        if callable(value):

            def safe_wrapper(*args, **kwargs):
                # not trying `n_retries` times here, just for simplicity
                with h5.File(self.path, mode="r", libver="latest", swmr=True) as f:
                    output = getattr(f["data"], key)(*args, **kwargs)
                return output

            return safe_wrapper
        # otherwise just return value
        else:
            return value
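
In practice, `H5Reader` objects are usually obtained through attribute access on a `Directory` rather than constructed by hand. A minimal sketch of direct use, assuming the path below is a hypothetical datamate-written HDF5 file containing a `"data"` dataset in SWMR mode:

```python
from datamate.io import H5Reader

reader = H5Reader("runs/run_01/responses.h5", assert_swmr=True, n_retries=10)

print(len(reader), reader.shape, reader.dtype)

# Every access opens the file, reads, and closes it again, so no handle stays
# open between calls; transient read failures are retried up to n_retries times.
first_rows = reader[:10]   # slice of the "data" dataset as a NumPy array
full_array = reader[...]   # load the whole dataset into memory
n_dims = reader.ndim       # scalar h5py.Dataset attributes are forwarded too
```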

directory_to_df

directory_to_df(directory, dtypes=None)

Convert a directory to a pandas DataFrame.

Creates a DataFrame from HDF5 datasets in the directory. Single-element datasets are broadcast to match the most common length.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `directory` | `Directory` | Directory object containing HDF5 datasets. | *required* |
| `dtypes` | `Optional[Dict[str, dtype]]` | Optional mapping of column names to NumPy dtypes. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame containing the directory data. |

Example:

```python
dir = Directory("path/to/dir")
df = directory_to_df(dir, {"col1": np.float32})
```

Source code in datamate/io.py
def directory_to_df(
    directory: "Directory", dtypes: Optional[Dict[str, np.dtype]] = None
) -> DataFrame:
    """Convert a directory to a pandas DataFrame.

    Creates a DataFrame from HDF5 datasets in the directory. Single-element datasets
    are broadcast to match the most common length.

    Args:
        directory: Directory object containing HDF5 datasets.
        dtypes: Optional mapping of column names to NumPy dtypes.

    Returns:
        DataFrame containing the directory data.

    Example:
        ```python
        dir = Directory("path/to/dir")
        df = directory_to_df(dir, {"col1": np.float32})
        ```
    """
    from .utils import byte_to_str

    df_dict = {
        key: getattr(directory, key)[...]
        for key in list(directory.keys())
        if isinstance(getattr(directory, key), H5Reader)
    }

    # Get the lengths of all datasets.
    nelements = {k: len(v) or 1 for k, v in df_dict.items()}

    lengths, counts = np.unique([val for val in nelements.values()], return_counts=True)
    most_frequent_length = lengths[np.argmax(counts)]

    # If there are single element datasets, just create a new column of most_frequent_length and put the value in each row.
    if lengths.min() == 1:
        for k, v in nelements.items():
            if v == 1:
                df_dict[k] = df_dict[k].repeat(most_frequent_length)

    df_dict = byte_to_str(df_dict)

    if dtypes is not None:
        df_dict = {
            k: np.array(v).astype(dtypes[k]) for k, v in df_dict.items() if k in dtypes
        }
    return DataFrame.from_dict(
        {k: v.tolist() if v.ndim > 1 else v for k, v in df_dict.items()}
    )
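
The broadcasting rule can be seen with a small hypothetical directory: single-element datasets are repeated to the most common dataset length so that every column ends up with the same number of rows. A sketch, assuming the standard datamate pattern of writing arrays by attribute assignment (path and column names are placeholders):

```python
import numpy as np

from datamate import Directory
from datamate.io import directory_to_df

exp = Directory("runs/run_01")          # hypothetical directory on disk
exp.responses = np.random.rand(100)     # length-100 dataset
exp.labels = np.arange(100)             # length-100 dataset
exp.temperature = np.array([36.5])      # single-element dataset

df = directory_to_df(exp)
# "temperature" is repeated to the most common length (100), so all three
# columns have 100 rows.
print(df.shape)  # (100, 3)
```

Note that when `dtypes` is given, only the columns listed in `dtypes` are kept in the resulting DataFrame, as the comprehension in the source above shows.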