Source code for pyActLearn.CASAS.h5py

import h5py
import logging
import dateutil.parser
import numpy as np
from collections import OrderedDict

logger = logging.getLogger(__name__)


class CASASHDF5:
    """Create and retrieve CASAS smart home data stored in an HDF5 file.

    The data saved to or retrieved from the HDF5 file are pre-calculated
    features produced by the :class:`CASASData` class. The file also carries
    meta-data about the dataset: a description of each feature column, the
    target classes and their colors, and splits by week and/or by day.

    Attributes:
        _file (:class:`h5py.File`): :class:`h5py.File` object that represents
            the root group.

    Args:
        filename (:obj:`str`): HDF5 file name.
        mode (:obj:`str`): 'r' to load from an existing file, 'w' to create a
            new HDF5 data file.
        driver (:obj:`str`): Passed through unchanged to :class:`h5py.File`.

    Raises:
        ValueError: If ``mode`` is neither 'r' nor 'w'.
    """

    def __init__(self, filename, mode='r', driver=None):
        # Validate the mode before touching the file system so that an
        # invalid mode does not leave an open file handle behind.
        if mode not in ('r', 'w'):
            raise ValueError('mode should be \'w\' or \'r\', but got %s.' % mode)
        self._file = h5py.File(filename, mode=mode, driver=driver)
        if mode == 'w':
            self._sources = []
            self._weeks = OrderedDict()
            self._days = OrderedDict()
            self._feature_description = []
            self._target_description = []
            self._target_colors = []
            self._sensors = []
            self._comment = ''
            self._bg_target = ''
        else:
            try:
                self._load_dataset_info()
            except Exception:
                # Do not leak the handle when the meta-data cannot be parsed.
                self._file.close()
                raise

    def fetch_data(self, start_split=None, stop_split=None, pre_load=0):
        """Fetch data between start and stop splits.

        Args:
            start_split (:obj:`str`): Begin of data.
            stop_split (:obj:`str`): End of data.
            pre_load (:obj:`int`): Load extra number of data before start
                split.

        Returns:
            :obj:`tuple`: ``(time_list, features, targets)`` sliced by the
            requested splits. ``time_list`` is a list of datetime objects,
            the other two are whatever slicing the corresponding dataset
            returns. An entry is ``None`` when that source is not listed in
            the dataset's sources.
        """
        start, stop = self._get_split_range(start_split, stop_split, pre_load)
        # Time stamps are stored as UTF-8 encoded ISO strings.
        if 'time' in self._sources:
            time_list = [
                dateutil.parser.parse(date_string.decode('utf-8'))
                for date_string in self._file['time'][start:stop]
            ]
        else:
            time_list = None
        # Feature array
        if 'features' in self._sources:
            features = self._file['features'][start:stop]
        else:
            features = None
        # Label array
        if 'targets' in self._sources:
            targets = self._file['targets'][start:stop]
        else:
            targets = None
        return time_list, features, targets

    # region Metadata Auxiliary Functions
    def num_sensors(self):
        """Return the number of sensors in the sensor list."""
        return len(self._sensors)

    def get_sensor_by_index(self, i):
        """Get sensor name by index.

        Args:
            i (:obj:`int`): Index to sensor.
        """
        return self._sensors[i]

    def num_features(self):
        """Get number of features in the dataset."""
        return len(self._feature_description)

    def get_feature_description_by_index(self, i):
        """Get the description of feature column :math:`i`.

        Args:
            i (:obj:`int`): Column index.

        Returns:
            :obj:`str`: Corresponding column description.
        """
        return self._feature_description[i]

    def num_targets(self):
        """Total number of target classes.

        Returns:
            :obj:`int`: Total number of target classes.
        """
        return len(self._target_description)

    def get_target_descriptions(self):
        """Get list of target descriptions.

        Returns:
            :obj:`list` of :obj:`str`: List of target class description
            strings.
        """
        return self._target_description

    def get_target_description_by_index(self, i):
        """Get target description by class index :math:`i`.

        Args:
            i (:obj:`int`): Class index.

        Returns:
            :obj:`str`: Corresponding target class description.
        """
        return self._target_description[i]

    def get_target_colors(self):
        """Get list of target colors.

        Returns:
            :obj:`list` of :obj:`str`: Color string of each target class.
        """
        return self._target_colors

    def get_target_color_by_index(self, i):
        """Get the color string of target class :math:`i`.

        Args:
            i (:obj:`int`): Class index.

        Returns:
            :obj:`str`: Corresponding target class color string.
        """
        return self._target_colors[i]

    def is_bg_target(self, i=None, label=None):
        """Check if the class given by ``i`` or ``label`` is background.

        ``i`` takes precedence over ``label`` when both are given.

        Args:
            i (:obj:`int`): Class index.
            label (:obj:`str`): Class name.

        Returns:
            :obj:`bool`: True if it is considered background. False when
            neither argument is given, or when the background target is not
            one of the known target classes.
        """
        if i is not None:
            # Without a known background class nothing can be background;
            # list.index would raise ValueError here otherwise.
            if self._bg_target not in self._target_description:
                return False
            return i == self._target_description.index(self._bg_target)
        if label is not None:
            return label == self._bg_target
        return False

    def get_bg_target(self):
        """Get the description of the target class considered background.

        Returns:
            :obj:`str`: Name of the class which is considered background in
            the dataset. Usually it is 'Other_Activity'.
        """
        return self._bg_target

    def get_bg_target_id(self):
        """Get the id of the target class considered background.

        Returns:
            :obj:`int`: The index of the target class which is considered
            background in the dataset.

        Raises:
            ValueError: If the background target is not among the target
                descriptions.
        """
        return self._target_description.index(self._bg_target)

    def num_between_splits(self, start_split=None, stop_split=None):
        """Get the number of items between splits.

        Args:
            start_split (:obj:`str`): Begin of data.
            stop_split (:obj:`str`): End of data.

        Returns:
            :obj:`int`: The number of items between two splits.
        """
        start, stop = self._get_split_range(start_split, stop_split)
        return stop - start

    def get_weeks_info(self):
        """Get splits by week.

        Returns:
            :obj:`list` of :obj:`tuple`: List of (key, value) tuples, where
            key is the name of the split and value is the number of items in
            that split.
        """
        return [(week, bounds[1] - bounds[0])
                for week, bounds in self._weeks.items()]

    def get_days_info(self):
        """Get splits by day.

        Returns:
            :obj:`list` of :obj:`tuple`: List of (key, value) tuples, where
            key is the name of the split and value is the number of items in
            that split.
        """
        return [(day, bounds[1] - bounds[0])
                for day, bounds in self._days.items()]
    # endregion

    # region CASASH5PY Dataset Creation
    def create_features(self, feature_array, feature_description):
        """Create feature dataset.

        Logs an error and does nothing if the feature dataset already exists.

        Args:
            feature_array (:obj:`numpy.ndarray`): Numpy array holding
                calculated feature vectors.
            feature_description (:obj:`list` of :obj:`str`): List of strings
                that describe each column of feature vectors.
        """
        if 'features' in self._sources:
            logger.error('Feature array already exists in the dataset.')
            return
        self._sources.append('features')
        self._feature_description = feature_description
        # Create feature array ("chunks" is the keyword h5py expects).
        dset = self._file.create_dataset(
            'features', data=feature_array, chunks=True,
            compression="gzip", compression_opts=9)
        dset.dims[0].label = 'batch'
        dset.dims[1].label = 'feature'
        # Add feature description as attributes
        self._file.attrs['features'] = [
            description.encode('utf-8') for description in feature_description]

    def create_targets(self, target_array, target_description, target_colors):
        """Create target dataset.

        Logs an error and does nothing if the target dataset already exists.

        Args:
            target_array (:obj:`numpy.ndarray`): Numpy array holding target
                labels.
            target_description (:obj:`list` of :obj:`str`): List of strings
                that describe each target class.
            target_colors (:obj:`list` of :obj:`str`): List of color values
                corresponding to each target class.
        """
        if 'targets' in self._sources:
            logger.error('Target array already exists in the dataset.')
            return
        self._sources.append('targets')
        self._target_description = target_description
        self._target_colors = target_colors
        # Create target array, stored as a single column
        dset = self._file.create_dataset(
            'targets', data=target_array.reshape((target_array.size, 1)))
        dset.dims[0].label = 'batch'
        dset.dims[1].label = 'target'
        # Add target description as attributes
        self._file.attrs['targets'] = [
            description.encode('utf-8') for description in target_description]
        # Add target color as attributes
        self._file.attrs['target_colors'] = [
            color_string.encode('utf-8') for color_string in target_colors]

    def create_time_list(self, time_array):
        """Create time list.

        Logs an error and does nothing if the time list already exists.

        Args:
            time_array (:obj:`list` of :obj:`datetime`): datetime
                corresponding to each feature vector in feature dataset.
        """
        if 'time' in self._sources:
            logger.error('Time list already exists in the dataset.')
            return
        self._sources.append('time')
        # Time stamps are stored as variable-length UTF-8 ISO strings.
        num_items = len(time_array)
        dt = h5py.special_dtype(vlen=bytes)
        dset = self._file.create_dataset('time', (num_items,), dtype=dt)
        for i, time_stamp in enumerate(time_array):
            dset[i] = time_stamp.isoformat().encode('utf-8')

    def create_splits(self, days, weeks):
        """Create splits by days and weeks.

        Split ``k`` covers the index range ``[boundaries[k],
        boundaries[k + 1])`` and is named ``day_k`` or ``week_k``. The splits
        are written to the ``days`` and ``weeks`` file attributes and are
        also kept in memory. Logs an error and does nothing if splits already
        exist.

        Args:
            days (:obj:`list` of :obj:`int`): Start index for each day.
            weeks (:obj:`list` of :obj:`int`): Start index for each week.
        """
        if self._days or self._weeks:
            logger.error('Splits already exist.')
            return
        self._days = OrderedDict()
        self._weeks = OrderedDict()
        days_array = self._build_split_table('day', days, self._days)
        weeks_array = self._build_split_table('week', weeks, self._weeks)
        # Set attributes
        self._file.attrs['days'] = days_array
        self._file.attrs['weeks'] = weeks_array

    @staticmethod
    def _build_split_table(prefix, boundaries, registry):
        """Build the structured array describing one family of splits.

        Args:
            prefix (:obj:`str`): Split name prefix, 'day' or 'week'.
            boundaries (:obj:`list` of :obj:`int`): Start index of each
                split; the last entry is the stop index of the last split.
            registry (:obj:`dict`): Mapping filled in place with
                ``name -> [start, stop]`` for every split.

        Returns:
            :obj:`numpy.ndarray`: Structured array with fields ``name``,
            ``start`` and ``stop``, one row per split.
        """
        num_splits = max(len(boundaries) - 1, 0)
        names = ['%s_%d' % (prefix, i) for i in range(num_splits)]
        # Size the name field from the longest generated name so that
        # multi-digit split names are not truncated.
        name_width = max([len(name) for name in names] + [1])
        table = np.empty(num_splits, dtype=np.dtype([
            ('name', 'S%d' % name_width),
            ('start', np.int64),
            ('stop', np.int64),
        ]))
        for i, name in enumerate(names):
            start, stop = boundaries[i], boundaries[i + 1]
            table[i] = (name.encode('utf-8'), start, stop)
            registry[name] = [start, stop]
        return table

    def create_comments(self, comment):
        """Add comments to dataset.

        Args:
            comment (:obj:`str`): Comments to the dataset.
        """
        self._comment = comment
        self._file.attrs['comment'] = comment.encode('utf-8')

    def create_sensors(self, sensors):
        """Add sensors list to attributes.

        If the sensor IDs in the dataset are not binary coded, there is a
        need to provide the sensor list to go along with the feature vectors.

        Args:
            sensors (:obj:`list` of :obj:`str`): List of sensor names
                corresponding to the ids in the feature array.
        """
        # Keep the in-memory list in sync so num_sensors() and
        # get_sensor_by_index() work on a dataset that is being created.
        self._sensors = list(sensors)
        self._file.attrs['sensors'] = [
            sensor.encode('utf-8') for sensor in self._sensors]

    def set_background_target(self, target_name):
        """Set 'target_name' as background target.

        Logs an error and does nothing if a background target was already
        set.

        Args:
            target_name (:obj:`str`): Name of background target.
        """
        if self._bg_target != '':
            logger.error('background target label has been set to %s.',
                         self._bg_target)
            return
        self._bg_target = target_name
        self._file.attrs['bg_target'] = target_name.encode('utf-8')

    def flush(self):
        """Write the list of sources to the file and flush it."""
        self._file.attrs['sources'] = [
            source.encode('utf-8') for source in self._sources]
        self._file.flush()
    # endregion

    def close(self):
        """Close dataset."""
        self._file.close()

    # region InternalSupportRoutines
    def _get_split_range(self, start_split=None, stop_split=None, pre_load=0):
        """Get the requested splits range.

        Args:
            start_split (:obj:`str`): Begin of data. ``None`` selects the
                whole dataset.
            stop_split (:obj:`str`): End of data. ``None`` keeps the stop
                index implied by ``start_split``.
            pre_load (:obj:`int`): Load extra number of data before start
                split.

        Returns:
            :obj:`tuple` of :obj:`int`: Returns a tuple of the start and stop
            index.

        Raises:
            ValueError: If a split name is neither a week nor a day split.
        """
        # Determine the start index (and the default stop index)
        if start_split is None:
            start = 0
            stop = self._file[self._sources[0]].shape[0]
        elif start_split in self._weeks:
            start = self._weeks[start_split][0]
            stop = self._weeks[start_split][1]
        elif start_split in self._days:
            start = self._days[start_split][0]
            stop = self._days[start_split][1]
        else:
            raise ValueError('start_split error: Cannot find %s in splitting array.' % start_split)
        # Determine the stop index
        if stop_split is not None:
            if stop_split in self._weeks:
                stop = self._weeks[stop_split][1]
            elif stop_split in self._days:
                # Day splits live in the day table, not the week table.
                stop = self._days[stop_split][1]
            else:
                raise ValueError('stop_split error: Cannot find %s in splitting array.' % stop_split)
        # Compensate pre-load, never reaching before the first item
        start = max(start - pre_load, 0)
        return start, stop

    def _read_string_list(self, key):
        """Decode the list of UTF-8 strings stored in file attribute ``key``.

        Returns an empty list when the attribute is absent.
        """
        if key not in self._file.attrs:
            return []
        return [item.decode('utf-8') for item in self._file.attrs[key]]

    def _read_string(self, key):
        """Decode the UTF-8 string stored in file attribute ``key``.

        Returns an empty string when the attribute is absent.
        """
        if key not in self._file.attrs:
            return ''
        return self._file.attrs[key].decode('utf-8')

    def _load_dataset_info(self):
        """Populate attributes of current class from the file meta-data."""
        attrs = self._file.attrs
        # Sources set
        self._sources = self._read_string_list('sources')
        # Parse splits; they are only loaded when both tables are present
        self._weeks = OrderedDict()
        self._days = OrderedDict()
        if 'weeks' in attrs and 'days' in attrs:
            for row in attrs['weeks']:
                self._weeks[row['name'].decode('utf-8')] = [row['start'], row['stop']]
            for row in attrs['days']:
                self._days[row['name'].decode('utf-8')] = [row['start'], row['stop']]
        # Meta-data about dataset
        self._feature_description = self._read_string_list('features')
        self._target_description = self._read_string_list('targets')
        self._target_colors = self._read_string_list('target_colors')
        self._sensors = self._read_string_list('sensors')
        # Load comments and background target
        self._bg_target = self._read_string('bg_target')
        self._comment = self._read_string('comment')
    # endregion