Source code for h5pandas.dataframe

"""Module related to dataFrames."""

import numpy as np
import pandas
from h5pandas.h5array import HDF5ExtensionArray
import h5py
import re

# Matches dataset attribute keys of the form "series_attr_<a>_<b>".
# dataset_to_dataframe stores the value of such a key in dataframe[<a>].attrs[<b>].
# The first group is greedy, so <b> is whatever follows the LAST underscore of the key.
_pattern_attr_serie = re.compile("series_attr_(.*)_(.*)")



[docs]
def dataframe_to_hdf(
    dataframe: pandas.DataFrame,
    h5file: str | h5py.Group,
    dataset_name: str = "dataframe",
    index: list | None | pandas.Index = None,
    columns: list[str] | None = None,
    metadata: dict | None = None,
    *args,
    **kwargs,
) -> h5py.Dataset | None:
    """
    High-level function to write a DataFrame into a HDF5 file.

    Dataframe columns names (dataframe.columns) and attributes (dataframe.attrs)
    will be written inside the dataset attributes and can be retrieved later
    when accessing the file with h5pandas.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The dataframe to write.
    h5file : str or `h5py.File` or `h5py.Group`
        If it is a string : the name of the HDF5 file in which the dataframe will be written.
        If the file already exists then the dataframe is added to this file.
        Otherwise the file is created.
        If h5file is a `h5py.File` or `h5py.Group` object then it will be written inside this object.
    dataset_name : str, optional
        The name of the dataset that will contain the dataframe. Default = "dataframe".
    index: list, None or `pandas.Index`, optional
        Default=None.
        If not None, index will be written inside the HDF5 file and can be retrieved later with h5pandas.
    columns: list, optional
        Names of the columns of the dataframe to save, if any.
        If None, the dataframe's own column names are used.
    metadata : dict, optional
        Additional metadata to save with the dataframe as dataset attributes. Units or description for example.
        Default=None, which is treated as an empty dict.
    *args and **kwargs : additional parameters passed directly to h5py.create_dataset
        It can be compression options for example.
        See https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset
        and https://pypi.org/project/hdf5plugin/

    Returns
    -------
    dataset : h5py.Dataset or None
        The dataset created inside h5file.
        If h5file is a string, returns None.
    """
    # The named parameters are forwarded positionally on purpose: unpacking
    # *args after keyword arguments would push the extra positionals into the
    # ``h5file`` slot and fail with "got multiple values for argument".
    return _data_to_hf5(
        dataframe,
        h5file,
        dataset_name,
        index,
        columns,
        # A fresh dict per call avoids sharing one mutable default between calls.
        {} if metadata is None else metadata,
        *args,
        **kwargs,
    )




[docs]
def ndarray_to_hdf(
    array: np.ndarray,
    h5file: str | h5py.Group,
    dataset_name: str = "array",
    index: list | None | pandas.Index = None,
    columns: list[str] | None = None,
    metadata: dict | None = None,
    *args,
    **kwargs,
) -> h5py.Dataset | None:
    """
    High-level function to write a NumpyArray into a HDF5 file.

    Parameters
    ----------
    array : np.ndarray
        The array to write.
    h5file : str or `h5py.File` or `h5py.Group`
        If it is a string : the name of the HDF5 file in which the array will be written.
        If the file already exists then the array is added to this file.
        Otherwise the file is created.
        If h5file is a `h5py.File` or `h5py.Group` object then it will be written inside this object.
    dataset_name : str, optional
        The name of the dataset that will contain the array. Default = "array".
    index: list, None or `pandas.Index`, optional
        Default=None.
        If not None, index will be written inside the HDF5 file and can be retrieved later with h5pandas.
    columns: list, optional
        Names of the columns of the array to save, if any.
        If the array is a structured array and columns is None then structured names are used.
        Otherwise, if None, then nothing is written.
    metadata : dict, optional
        Additional metadata to save with the array as dataset attributes. Units or description for example.
        Default=None, which is treated as an empty dict.
    *args and **kwargs : additional parameters passed directly to h5py.create_dataset
        It can be compression options for example.
        See https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset
        and https://pypi.org/project/hdf5plugin/

    Returns
    -------
    dataset : h5py.Dataset or None
        The dataset created inside h5file.
        If h5file is a string, returns None.
    """
    # The named parameters are forwarded positionally on purpose: unpacking
    # *args after keyword arguments would push the extra positionals into the
    # ``h5file`` slot and fail with "got multiple values for argument".
    return _data_to_hf5(
        array,
        h5file,
        dataset_name,
        index,
        columns,
        # A fresh dict per call avoids sharing one mutable default between calls.
        {} if metadata is None else metadata,
        *args,
        **kwargs,
    )



def _data_to_hf5(
    array,
    h5file: str | h5py.Group,
    dataset_name: str = "dataframe",
    index: list | None | pandas.Index = None,
    columns: list[str] | None = None,
    metadata: dict | None = None,
    *args,
    **kwargs,
) -> h5py.Dataset | None:
    """
    Write ``array`` as dataset ``dataset_name`` inside ``h5file``.

    Shared implementation behind `dataframe_to_hdf` and `ndarray_to_hdf`.
    A dataset already stored under ``dataset_name`` is deleted first.

    Parameters
    ----------
    array : object exposing ``shape``
        The data to write (its ``shape`` is used to pick default chunks/maxshape).
    h5file : str or `h5py.File` or `h5py.Group`
        A string is opened as a h5pandas ``File`` in append mode and closed
        before returning. A plain `h5py.Group` is wrapped into a h5pandas ``Group``.
    dataset_name : str, optional
        Name of the dataset to create.
    index, columns, metadata :
        Forwarded as keyword arguments to ``h5file.create_dataset``.
        ``metadata=None`` is replaced by an empty dict.
    *args, **kwargs :
        Forwarded to ``h5file.create_dataset``. When absent from kwargs,
        ``chunks`` defaults to ``(array.shape[0], 1)`` and ``maxshape`` to
        ``None`` on every axis.

    Returns
    -------
    h5py.Dataset or None
        ``dataframe.h5.dataset`` of the object returned by ``create_dataset``,
        or None when ``h5file`` was given as a string (the file is closed).

    Raises
    ------
    TypeError
        If ``h5file`` is neither a str nor a `h5py.Group` (or subclass).
    """
    from h5pandas.group import File, Group

    if metadata is None:
        metadata = {}

    h5file_is_string = isinstance(h5file, str)
    if h5file_is_string:
        h5file = File(h5file, "a", libver=("v110", "latest"))
    elif not isinstance(h5file, h5py.Group):
        # The original built this exception without raising it.
        raise TypeError("h5file must be either a str, a h5py.Group or h5py.File.")
    elif not isinstance(h5file, Group):
        h5file = Group(h5file)

    try:
        if dataset_name in h5file:
            del h5file[dataset_name]

        # select default parameters for optimised dataframe writing
        if "chunks" not in kwargs:
            # NOTE(review): this default has two entries, so it assumes 2-D data.
            kwargs["chunks"] = (array.shape[0], 1)

        if "maxshape" not in kwargs:
            kwargs["maxshape"] = [None] * len(array.shape)

        dataframe = h5file.create_dataset(
            dataset_name,
            *args,
            data=array,
            index=index,
            columns=columns,
            metadata=metadata,
            **kwargs,
        )

        if h5file_is_string:
            # The file is about to be closed, a dataset handle would be unusable.
            return None
        return dataframe.h5.dataset
    finally:
        # Only close what this function opened, and do it even on failure
        # so the file handle is not leaked.
        if h5file_is_string:
            h5file.close()



[docs]
def dataset_to_dataframe(dataset: h5py.Dataset, columns=None, index=None, copy=False):
    """
    Transform a dataset into a DataFrame.

    Parameters
    ----------
    dataset : h5py.Dataset
        The dataset to convert into a DataFrame.
    columns : iterable, optional
        Column labels of the resulting frame, one per dataset column, in order.
        Label number ``i`` is attached to ``HDF5ExtensionArray(dataset, i)``;
        no column selection is performed.
        If None, the labels stored in ``dataset.attrs["columns"]`` are used when
        present, otherwise the positions ``0 .. dataset.shape[1] - 1``.
        ``bytes`` labels are decoded and ``None`` labels are replaced by their position.
    index : Index or array-like, optional
        Index to use for the resulting frame.
        If None, ``dataset.attrs["index"]`` is used when present; otherwise
        None is handed to pandas (presumably yielding a default index).
    copy : bool, optional
        Forwarded as the ``consolidate`` argument of pandas' ``arrays_to_mgr``.

    Returns
    -------
    pandas.DataFrame
        A dataFrame backed by the dataset.
        If you change the dataset values, the DataFrame will be changed.
        Every dataset attribute other than "columns" and "index" is copied to
        ``dataframe.attrs``, except keys matching ``series_attr_<col>_<name>``
        which are stored in ``dataframe[<col>].attrs[<name>]``.

    """
    # if no columns we try to find columns or we construct a tuple of None
    if columns is None:
        if "columns" in dataset.attrs:
            columns = _read_labels_attr(dataset, "columns")
        else:
            columns = (None,) * dataset.shape[1]

    # Normalise the labels; a plain loop accepts any iterable, sized or not.
    columns_decoded = []
    for position, label in enumerate(columns):
        if isinstance(label, (bytes, np.bytes_)):
            label = label.decode()
        elif label is None:
            label = position
        columns_decoded.append(label)

    if index is None and "index" in dataset.attrs:
        index = _read_labels_attr(dataset, "index")

    # We use a manager to speed up the dataFrame creation 0.8s -> 0.2s
    # NOTE(review): arrays_to_mgr and DataFrame._from_mgr are pandas internals;
    # confirm they exist with this signature in the supported pandas versions.
    arrays = [HDF5ExtensionArray(dataset, i) for i in range(len(columns_decoded))]
    from pandas.core.internals.construction import arrays_to_mgr

    mgr = arrays_to_mgr(
        arrays,
        columns_decoded,
        index,
        dtype=None,
        typ="block",
        consolidate=copy,
    )
    dataframe = pandas.DataFrame._from_mgr(mgr, axes=[columns_decoded, index])

    # copy the dataset attrs into the dataframe attrs
    for key, value in dataset.attrs.items():
        if key in ("columns", "index"):
            continue
        if isinstance(value, (bytes, np.bytes_)):
            value = value.decode()
        try:
            # Decodes arrays of bytes; anything else is kept unchanged.
            value = np.char.decode(value)
        except (AttributeError, TypeError):
            pass

        # special case for Series Attributes
        if m := _pattern_attr_serie.match(key):
            dataframe[m[1]].attrs[m[2]] = value
        else:
            dataframe.attrs[key] = value
    return dataframe


def _read_labels_attr(dataset, key):
    """Return the labels stored in ``dataset.attrs[key]``, as a tuple of decoded
    strings when numpy can decode them, otherwise the raw attribute value."""
    try:
        return tuple(np.char.decode(dataset.attrs[key]))
    except TypeError:
        return dataset.attrs[key]




[docs]
def group_to_dataframe(group) -> pandas.DataFrame:
    """
    Transform a group into a DataFrame.

    Two layouts are tried in order:

    1. a group carrying a ``pandas_type`` attribute equal to ``frame`` or
       ``frame_table`` (written by pandas/PyTables with format "fixed" or "table");
    2. a group whose children are all single-column datasets, each one
       becoming a column of the result.

    Parameters
    ----------
    group : h5py.group
        The group to convert into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        A dataFrame backed by the dataset.
        If you change the dataset values, the DataFrame will be changed.

    Raises
    ------
    ValueError
        If the group matches none of the supported layouts. The error raised
        by the last attempt is attached as ``__cause__``.
    """
    # First option : the dataframe has been written by pandas (PyTables) with format = "fixed" or "table"
    if "pandas_type" in group.attrs:
        pandas_type = group.attrs["pandas_type"]
        if isinstance(pandas_type, str):
            # Accept the attribute whether it is read back as str or as bytes.
            pandas_type = pandas_type.encode()
        if pandas_type == b"frame":
            return _group_fixed_to_dataframe(group)
        if pandas_type == b"frame_table":
            return _group_table_to_dataframe(group)

    # Second option : all the datasets have the same length, each one is a serie
    try:
        return _group_with_column_to_dataframe(group)
    except ValueError as err:
        # Keep the precise reason available instead of silently dropping it.
        raise ValueError("Group could not be converted into a DataFrame") from err



def _group_with_column_to_dataframe(group) -> pandas.DataFrame:
    """Build a DataFrame from a group in which every child is one column.

    Each child of ``group`` must be a `h5py.Dataset` without a "columns"
    attribute; it is wrapped in a `HDF5ExtensionArray` and becomes a Series
    named after the child. The Series are then concatenated side by side.

    Raises
    ------
    ValueError
        If a child is not a dataset, or if a child has a "columns" attribute.
    """

    def _child_as_series(child_name):
        # Validate one child, then expose it as a named Series.
        child = group[child_name]
        if not isinstance(child, h5py.Dataset):
            raise ValueError("All child of the group must be datasets")
        if "columns" in child.attrs:
            raise ValueError("This dataset contains several columns")
        return pandas.Series(HDF5ExtensionArray(child), name=child_name, copy=False)

    # One Series per child, assembled column-wise into a DataFrame.
    return pandas.concat([_child_as_series(name) for name in group], axis=1)


def _group_fixed_to_dataframe(group) -> pandas.DataFrame:
    """Build a DataFrame from a group whose ``pandas_type`` is "frame".

    Column labels come from ``group["axis0"]``, the index from
    ``group["axis1"]`` and the values from ``group["block0_values"]``.
    An axis with a string dtype is decoded with ``np.char.decode``;
    any other axis is passed through as the dataset itself.
    """

    def _axis_labels(axis_name):
        # Decode string-typed axes, leave the others untouched.
        if pandas.api.types.is_string_dtype(group[axis_name].dtype):
            return np.char.decode(group[axis_name])
        return group[axis_name]

    column_labels = _axis_labels("axis0")
    row_labels = _axis_labels("axis1")
    # NOTE(review): only "block0_values" is read here; confirm whether groups
    # holding further blocks need to be supported.
    return dataset_to_dataframe(
        group["block0_values"], columns=column_labels, index=row_labels
    )


def _group_table_to_dataframe(group) -> pandas.DataFrame:
    import warnings

    warnings.warn(
        "You should reconsider using h5pandas to open table dataset.",
        UserWarning,
    )
    raise NotImplementedError(
        "You should reconsider using h5pandas to open table dataset."
    )


# Drop any "h5" accessor left on the pandas classes before it is registered
# again below; registering over an existing attribute makes pandas warn.
try:
    delattr(pandas.DataFrame, "h5")
    delattr(pandas.Series, "h5")
except AttributeError:
    # Nothing registered yet (e.g. first import): nothing to clean up.
    pass



[docs]
@pandas.api.extensions.register_dataframe_accessor("h5")
@pandas.api.extensions.register_series_accessor("h5")
class DatasetAccessor:
    """Accessor to dataset for pandas object from h5pandas.

    Registered as ``.h5`` on both `pandas.DataFrame` and `pandas.Series`.
    It exposes the dataset found in the ``_dataset`` attribute of the
    array backing the pandas object.
    """

    def __init__(self, pandas_obj):
        """
        Init the accessor of a Panda Series or DataFrame.

        Parameters
        ----------
        pandas_obj : pandas.Series or pandas.DataFrame

        Raises
        ------
        AttributeError
            If the object is not backed by an array exposing ``_dataset``.
        """
        self._values = self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        """Verify the DataFrame is backed by opened h5file.

        Returns the backing array: the values of the first column for a
        DataFrame, the values of a Series, or ``obj`` itself otherwise.

        Raises
        ------
        AttributeError
            If that array has no ``_dataset`` attribute, which includes a
            DataFrame without any column.
        """
        if isinstance(obj, pandas.DataFrame):
            # Positional access: works with duplicated labels, and an empty
            # frame ends in the AttributeError below rather than an IndexError.
            values = obj.iloc[:, 0].values if obj.shape[1] else None
        elif isinstance(obj, pandas.Series):
            # The original also tested a misspelled "_datatset" attribute here;
            # that test could never fail, so it is dropped.
            values = obj.values
        else:
            values = obj
        if not hasattr(values, "_dataset"):
            raise AttributeError("Pandas Object must be backed by h5file.")
        return values

    @property
    def file(self):
        """Return the file backing the Pandas Object."""
        return self._values._dataset.file

    @property
    def dataset(self):
        """Return the dataset backing the Pandas Object."""
        return self._values._dataset

    @property
    def attrs(self):
        """Return the attributes of the dataset backing the Pandas Object."""
        return self._values._dataset.attrs

    @property
    def name(self):
        """Return the name of the dataset backing the Pandas Object."""
        return self._values._dataset.name