Source code for h5pandas.group

"""Very thin overlay over h5py library."""

import numpy as np
from h5pandas.dataframe import dataset_to_dataframe, group_to_dataframe
from h5pandas import HDF5Dtype
import h5py
import warnings

try:
    from pandas import DataFrame, Index
except ModuleNotFoundError:
    DataFrame = type(None)
    Index = type(None)



[docs]
class Group(h5py.Group):
    """
    h5py Group that provides a DataFrame instead of dataset.

    See h5py documentation: https://docs.h5py.org/en/stable/high/group.html
    """

    def __init__(self, group_id, columns=None):
        """
        Transform an object into a Group that provides DataFrame instead of dataset.

        Parameters
        ----------
        group_id : TYPE
            DESCRIPTION.
        columns : TYPE, optional
            DESCRIPTION. The default is None.

        Returns
        -------
        None.

        """
        """"""
        if isinstance(group_id, h5py.File):
            id = group_id["/"]._id
        elif isinstance(group_id, (h5py.h5f.FileID, h5py.h5g.GroupID)):
            id = group_id
        elif isinstance(group_id, h5py.Group):
            id = group_id._id

        super().__init__(id)

    def __getitem__(self, *args, **kwargs):
        """Convert item into DataFrame before returning it."""
        item = super().__getitem__(*args, **kwargs)
        if isinstance(item, h5py.Group):
            try:
                return group_to_dataframe(item)
            except ValueError:
                return Group(item)
        elif isinstance(item, h5py.Dataset):
            try:
                return dataset_to_dataframe(item)
            except ValueError:
                return item
        print("Item type not managed")
        return item

    def __getattribute__(self, name):
        """Get DataFrame if possible."""
        item = super().__getattribute__(name)
        if isinstance(item, h5py.File):
            item.__class__ = File
        elif isinstance(item, h5py.Group):
            item = Group(item)
        return item


[docs]
    def create_dataset(
        self,
        name,
        shape=None,
        dtype=None,
        data=None,
        index: list | None | Index = None,
        columns: list[str] | None = None,
        metadata: dict = {},
        **kwargs,
    ):
        """
        Create a dataset.

        If columns is provided or if data is a DataFrame,
        the columns names are written as attribute of the dataset.
        If data is a DataFrame, its attributes (data.attrs) are saved into the
        dataset attributes so that they can be retrieve later with h5pandas.
        If metadata is provided, it is written inside the dataset attributes.
        If metadata as the same key as data.attrs, metadata will be written in the file.

        See h5py documentation: https://docs.h5py.org/en/stable/high/dataset.html

        Parameters
        ----------
        name: str
            Name of the dataset (absolute or relative).  Provide None to make
            an anonymous dataset.
        shape
            Dataset shape.  Use "()" for scalar datasets.  Required if "data"
            isn't provided.
        dtype
            Numpy dtype or string.  If omitted, dtype('f') will be used.
            Required if "data" isn't provided; otherwise, overrides data
            array's dtype.
        data
            Provide data to initialize the dataset.  If used, you can omit
            shape and dtype arguments.
        index: list, None or `pandas.Index`, optional
            Default=None.
            If not None, index will be written inside the HDF5 file and can be retrieve later with h5pandas.
        columns: list, optional
            names of the columns of the array to save, if any.
            If the array is a structured array and columns is none then structured names are used.
            Otherwise, if None, then nothing is written.
        metadata : dict, optional
            Additional metadata to save with the dataset attributes.


        Keyword-only arguments:

        Returns
        -------
        pandas.DataFrame
            The newly create DataFrame.

        """

        def add_attribute(dataset, attr_name, values):
            if isinstance(values, dict):
                warnings.warn(
                    "When saving dataframe into HDF5 file : attribute of type dict is not supported yet",
                    Warning,
                )
                return

            values = np.atleast_1d(values)
            if values.dtype == object:
                values = values.astype(type(values[0]))
            try:
                dataset.attrs[attr_name] = np.char.encode(values)
            except TypeError:
                dataset.attrs[attr_name] = values

        # preprocess of numpy structured arrays
        if isinstance(data, np.ndarray):
            if columns is None:
                columns = data.dtype.names
            if data.dtype.names is not None:
                # on destructure le numpy struct array si besoin
                from numpy.lib import recfunctions as rfn

                data = rfn.structured_to_unstructured(data)
        elif isinstance(data, DataFrame):
            # We look for properties inside the dataFrame
            metadata = data.attrs | metadata
            # We also look for attributes inside the series that are not in the dataset
            for label, serie in data.items():
                for key, value in serie.attrs.items():
                    if key not in metadata:
                        metadata[f"series_attr_{label}_{key}"] = value

            if columns is None:
                columns = list(data.columns)
            if index is None:
                index = data.index
            # In some cases we need to convert the dataframe
            # FIXME : H5Dtype need to inherit from pandas.NumpyDtype ?
            if isinstance(data.dtypes.iloc[0], HDF5Dtype):
                data = data.to_numpy(copy=False, dtype=data.dtypes.iloc[0].type)

        dataset = super().create_dataset(
            name, shape=shape, dtype=dtype, data=data, **kwargs
        )
        # Write columns name inside the dataFrame
        if columns is not None:
            add_attribute(dataset, "columns", columns)

        # Write index inside the dataFrame
        if index is not None:
            add_attribute(dataset, "index", index)

        # Write attributes inside the dataFrame
        for name, value in metadata.items():
            try:
                add_attribute(dataset, name, value)
            except Exception:
                print("Could not add {} metadata to h5file metadata".format(name))

        return dataset_to_dataframe(dataset, index=index, columns=columns)





[docs]
class File(h5py.File, Group):
    """
    h5py File that provides a DataFrame instead of dataset.

    See h5py documentation: https://docs.h5py.org/en/stable/high/file.html
    """

    def __getitem__(self, *args, **kwargs):
        """Getter of Group class."""
        return super(h5py.File, self).__getitem__(*args, **kwargs)