Source code for h5pandas.group

"""Very thin overlay over h5py library."""

import numpy as np
from h5pandas.dataframe import dataset_to_dataframe, group_to_dataframe
from h5pandas import HDF5Dtype
import h5py
import warnings

try:
    from pandas import DataFrame, Index
except ModuleNotFoundError:
    DataFrame = type(None)
    Index = type(None)


class Group(h5py.Group):
    """
    h5py Group that provides a DataFrame instead of dataset.

    See h5py documentation: https://docs.h5py.org/en/stable/high/group.html
    """

    def __init__(self, group_id, columns=None):
        """
        Wrap an existing h5py object into a Group that provides DataFrames.

        Parameters
        ----------
        group_id : h5py.File, h5py.Group, h5py.h5f.FileID or h5py.h5g.GroupID
            Object to wrap. For a file, the identifier of its root group
            ("/") is used; for a group, its own identifier is used; a
            low-level identifier is used as is.
        columns : optional
            Currently unused; kept so that existing callers keep working.
            The default is None.

        Raises
        ------
        TypeError
            If ``group_id`` is none of the supported types.
        """
        # h5py.File is tested first because it is itself an h5py.Group.
        if isinstance(group_id, h5py.File):
            low_level_id = group_id["/"]._id
        elif isinstance(group_id, (h5py.h5f.FileID, h5py.h5g.GroupID)):
            low_level_id = group_id
        elif isinstance(group_id, h5py.Group):
            low_level_id = group_id._id
        else:
            # Fail with an explicit message instead of an UnboundLocalError.
            raise TypeError(
                "group_id must be a h5py File, Group, FileID or GroupID, "
                f"not {type(group_id).__name__}"
            )
        super().__init__(low_level_id)

    def __getitem__(self, *args, **kwargs):
        """
        Convert item into DataFrame before returning it.

        Groups go through ``group_to_dataframe`` and datasets through
        ``dataset_to_dataframe``. If the conversion raises ValueError, a
        group is returned wrapped into a `Group` and a dataset is returned
        unchanged. Any other kind of item is returned unchanged after a
        warning has been emitted.
        """
        item = super().__getitem__(*args, **kwargs)
        if isinstance(item, h5py.Group):
            try:
                return group_to_dataframe(item)
            except ValueError:
                return Group(item)
        if isinstance(item, h5py.Dataset):
            try:
                return dataset_to_dataframe(item)
            except ValueError:
                return item
        warnings.warn("Item type not managed", stacklevel=2)
        return item

    def __getattribute__(self, name):
        """
        Get DataFrame if possible.

        Attributes that are h5py files are re-classed in place as `File`,
        and attributes that are h5py groups are wrapped into a `Group`, so
        that objects reached through attributes also provide DataFrames.
        """
        item = super().__getattribute__(name)
        if isinstance(item, h5py.File):
            item.__class__ = File
        elif isinstance(item, h5py.Group):
            item = Group(item)
        return item

    def create_dataset(
        self,
        name,
        shape=None,
        dtype=None,
        data=None,
        index: list | None | Index = None,
        columns: list[str] | None = None,
        metadata: dict | None = None,
        **kwargs,
    ):
        """
        Create a dataset.

        If columns is provided or if data is a DataFrame, the column names
        are written as an attribute of the dataset.

        If data is a DataFrame, its attributes (data.attrs) are saved into
        the dataset attributes so that they can be retrieved later with
        h5pandas. Attributes of the individual series whose key is not
        already present are saved under ``series_attr_<label>_<key>``.

        If metadata is provided, it is written inside the dataset
        attributes. If metadata has the same key as data.attrs, the value
        from metadata is written in the file.

        See h5py documentation: https://docs.h5py.org/en/stable/high/dataset.html

        Parameters
        ----------
        name: str
            Name of the dataset (absolute or relative).
            Provide None to make an anonymous dataset.
        shape
            Dataset shape. Use "()" for scalar datasets.
            Required if "data" isn't provided.
        dtype
            Numpy dtype or string. If omitted, dtype('f') will be used.
            Required if "data" isn't provided; otherwise, overrides data
            array's dtype.
        data
            Provide data to initialize the dataset. If used, you can omit
            shape and dtype arguments.
        index: list, None or `pandas.Index`, optional
            Default=None. If not None, index will be written inside the
            HDF5 file and can be retrieved later with h5pandas. If None and
            data is a DataFrame, the index of the DataFrame is used.
        columns: list, optional
            Names of the columns of the array to save, if any.
            If the array is a structured array and columns is None then the
            structured names are used. If data is a DataFrame and columns
            is None then the DataFrame columns are used.
            Otherwise, if None, then nothing is written.
        metadata : dict, optional
            Additional metadata to save with the dataset attributes.
            None (the default) means no additional metadata.
        **kwargs
            Forwarded unchanged to ``h5py.Group.create_dataset``.

        Returns
        -------
        pandas.DataFrame
            The newly created DataFrame, as built by
            ``dataset_to_dataframe`` from the new dataset.
        """

        def add_attribute(dataset, attr_name, values):
            """Write ``values`` as the attribute ``attr_name`` of ``dataset``."""
            if isinstance(values, dict):
                warnings.warn(
                    "When saving dataframe into HDF5 file : attribute of type dict is not supported yet",
                    Warning,
                )
                return
            values = np.atleast_1d(values)
            if values.dtype == object:
                # Cast object arrays to the type of their first element.
                values = values.astype(type(values[0]))
            try:
                # Text is stored encoded as bytes.
                dataset.attrs[attr_name] = np.char.encode(values)
            except TypeError:
                # Not a string array: store the values as they are.
                dataset.attrs[attr_name] = values

        # A fresh dict per call instead of a shared mutable default.
        if metadata is None:
            metadata = {}

        # preprocess of numpy structured arrays
        if isinstance(data, np.ndarray):
            if columns is None:
                columns = data.dtype.names
            if data.dtype.names is not None:
                # Flatten the numpy structured array into a plain array.
                from numpy.lib import recfunctions as rfn

                data = rfn.structured_to_unstructured(data)

        # "data is not None" matters when pandas is not installed: DataFrame
        # is then type(None) and isinstance(None, DataFrame) would be True.
        elif data is not None and isinstance(data, DataFrame):
            # We look for properties inside the dataFrame;
            # explicit metadata wins over data.attrs on duplicate keys.
            metadata = data.attrs | metadata

            # We also look for attributes inside the series that are not in the dataset
            for label, serie in data.items():
                for key, value in serie.attrs.items():
                    if key not in metadata:
                        metadata[f"series_attr_{label}_{key}"] = value

            if columns is None:
                columns = list(data.columns)
            if index is None:
                index = data.index

            # In some cases we need to convert the dataframe
            # FIXME : H5Dtype need to inherit from pandas.NumpyDtype ?
            # The length check avoids an IndexError on a frame without columns.
            if len(data.columns) > 0 and isinstance(data.dtypes.iloc[0], HDF5Dtype):
                data = data.to_numpy(copy=False, dtype=data.dtypes.iloc[0].type)

        dataset = super().create_dataset(
            name, shape=shape, dtype=dtype, data=data, **kwargs
        )

        # Write columns name inside the dataFrame
        if columns is not None:
            add_attribute(dataset, "columns", columns)

        # Write index inside the dataFrame
        if index is not None:
            add_attribute(dataset, "index", index)

        # Write attributes inside the dataFrame. This is best effort: one
        # attribute that cannot be stored must not prevent the others.
        for attr_name, value in metadata.items():
            try:
                add_attribute(dataset, attr_name, value)
            except Exception as err:
                warnings.warn(
                    "Could not add {} metadata to h5file metadata: {}".format(
                        attr_name, err
                    ),
                    stacklevel=2,
                )

        return dataset_to_dataframe(dataset, index=index, columns=columns)
class File(h5py.File, Group):
    """
    h5py File whose items are returned as DataFrames rather than datasets.

    See h5py documentation: https://docs.h5py.org/en/stable/high/file.html
    """

    def __getitem__(self, *args, **kwargs):
        """Return the item through the getter of the Group class."""
        # Start the lookup after h5py.File in the MRO so that the getter of
        # Group is the one that answers.
        group_getter = super(h5py.File, self).__getitem__
        return group_getter(*args, **kwargs)