From a25762bb0ae33ec840b52c103d5c491144ee5ffb Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 10 Mar 2014 15:51:50 -0400 Subject: [PATCH 01/65] Skeleton of the Dataset abstract class. --- mldata/dataset.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 914a3c1..67cf4da 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -1,17 +1,41 @@ -# -*- coding: utf-8 -*- +"""Datasets store the data used for experiments.""" -class Dataset(list): - info = {} +class Dataset(): + """The abstract superclass of every types of datasets used in MLData - def __init__(self, data=[]): - super(Dataset, self).__init__(data) + A `Dataset` presents a unified access to data, independent of the + implementation details such as laziness. + Parameters + ---------- + data : array_like + meta_data : MetaData -class LazyDataset(Dataset): - def __init__(self, lazy_functions): - super(LazyDataset, self).__init__() - self.lazy_functions = lazy_functions + Attributes + ---------- + data : array_like + The array of data to train on. + meta_data : Metadata + Information about the data. See `MetaData` documentation for more info. + """ + def __init__(self, data, meta_data): + self.data = data + assert isinstance(meta_data, Metadata) + self.meta_data = meta_data def __iter__(self): - return self.lazy_functions['__iter__']() + raise NotImplementedError + + def __getitem__(self, item): + raise NotImplementedError + + def get_splits(self): + pass + + def build(self): # Replace with constructor ? + pass + + def apply(self): + pass + From a6e925c2a8b125637fedf58bfcf894278849952d Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 10 Mar 2014 16:08:04 -0400 Subject: [PATCH 02/65] First draft of the metadata class. --- mldata/dataset.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/mldata/dataset.py b/mldata/dataset.py index 67cf4da..f0cab9b 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -39,3 +39,35 @@ def build(self): # Replace with constructor ? def apply(self): pass + +class Metadata(): + """Keep track of information about a dataset. + + An instance of this class is required to build a `Dataset`. It gives + information on how the dataset is called, the split, etc. + + A single `Dataset` can have multiple metadata files specifying different + split or a special pre-processing that needs to be applied. The + philosophy is to have a single physical copy of the dataset with + different views that can be created on the fly as needed. + + Attributes + ---------- + name : str + The name of the `Dataset` + nb_examples : int + The number of example in the dataset (including all splits). + dictionary : dict + Gives a mapping of words (str) to id (int). Used only when the + dataset has been saved as an array of numbers instead of text. + splits : tuple of int + Specifies the split used by this view of the dataset. + preprocess : function or None + A function that is callable on a `Dataset` to preprocess the data. + """ + def __init__(self): + self.name = "Default" + self.nb_examples = 0 + self.dictionary = None + self.splits = () + self.preprocess = None \ No newline at end of file From f21cca6e5d5030d1f6c432c080fb33d1d580de5b Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 10 Mar 2014 16:18:53 -0400 Subject: [PATCH 03/65] First draft of an in-memory implementation of a dataset. 
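The intended call pattern is roughly the sketch below. The iterator and
__getitem__ bodies are still stubs at this point, and the arrays and sizes
used here are only illustrative:

    import numpy as np
    from mldata.dataset import InMemoryDataset, Metadata

    meta = Metadata()
    examples = np.random.rand(100, 5)   # 100 examples, 5 features each
    targets = np.random.rand(100, 1)    # optional targets for supervised learning

    dset = InMemoryDataset(examples, meta, targets)
    for example, target in dset:        # everything already sits in memory
        pass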
--- mldata/dataset.py | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index f0cab9b..7330bbc 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -70,4 +70,45 @@ def __init__(self): self.nb_examples = 0 self.dictionary = None self.splits = () - self.preprocess = None \ No newline at end of file + self.preprocess = None + + +class InMemoryDataset(Dataset): + """Build a dataset entirely contained in memory. + + Load the data (an array-like object) in memory. Random access is then + insured to be fast. + + Parameters + ---------- + examples : array_like + The dataset. + meta_data : Metadata + The metadata of this dataset. + targets : ? + + See Also + -------- + Dataset : The parent class defining the interface of a dataset. + + """ + def __init__(self, examples, meta_data, targets=None): + super(InMemoryDataset, self).__init__(meta_data) + + self.data = examples + + if targets is None: + self.__iter__ = self._iter_without_target + else: + self.targets = targets + self.__iter__ = self._iter_with_target + + + def __getitem__(self, item): + pass + + def _iter_with_target(self): + pass + + def _iter_without_target(self): + pass \ No newline at end of file From 225ddc751411dc89a365fe5eeace9248b91e5a6f Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 11 Mar 2014 16:16:05 -0400 Subject: [PATCH 04/65] Added a version number to datasets metadata --- mldata/dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mldata/dataset.py b/mldata/dataset.py index 7330bbc..5f11d48 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -64,6 +64,8 @@ class Metadata(): Specifies the split used by this view of the dataset. preprocess : function or None A function that is callable on a `Dataset` to preprocess the data. + version : int + The version number of the dataset that is required. """ def __init__(self): self.name = "Default" @@ -71,6 +73,7 @@ def __init__(self): self.dictionary = None self.splits = () self.preprocess = None + self.version = 0 class InMemoryDataset(Dataset): From 5197fad4e192cc180af1a4b41b51829f8f9b065c Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 11 Mar 2014 16:29:56 -0400 Subject: [PATCH 05/65] Added the Dictionary class stub. --- mldata/dataset.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 5f11d48..ed7ca40 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -57,7 +57,7 @@ class Metadata(): The name of the `Dataset` nb_examples : int The number of example in the dataset (including all splits). - dictionary : dict + dictionary : Dictionary Gives a mapping of words (str) to id (int). Used only when the dataset has been saved as an array of numbers instead of text. splits : tuple of int @@ -114,4 +114,25 @@ def _iter_with_target(self): pass def _iter_without_target(self): - pass \ No newline at end of file + pass + + +class Dictionary: + """Word / integer association list + + This dictionary is used in `Metadata` for NLP problems. This class + ensures O(1) conversion from id to word and O(log n) conversion from word to + id. + + Notes + ----- + The class is *not yet implemented*. + + Plans are for the dictionary to be implemented as a list of words + alphabetically ordered with the index of the word being its id. A method + implements a binary search over the words in order to retrieve its id. 
+ """ + + def __init__(self): + raise NotImplementedError("The class Dictionary is not yet " + "implemented.") \ No newline at end of file From 27ef7800d889b277eb327946980d2f983cdcba46 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 11 Mar 2014 16:43:54 -0400 Subject: [PATCH 06/65] Updated datasets definition to correctly handle targets datasets. --- mldata/dataset.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index ed7ca40..7d09ce3 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -9,26 +9,31 @@ class Dataset(): Parameters ---------- + meta_data : Metadata data : array_like - meta_data : MetaData + target : array_like Attributes ---------- - data : array_like - The array of data to train on. meta_data : Metadata Information about the data. See `MetaData` documentation for more info. + data : array_like + The array of data to train on. + target : array_like, optional + The array of target to use for supervised learning. `target` should + be `None` when the dataset doesn't support supervised learning. """ - def __init__(self, data, meta_data): + def __init__(self, meta_data, data, target=None): self.data = data + self.target = target assert isinstance(meta_data, Metadata) self.meta_data = meta_data def __iter__(self): - raise NotImplementedError + raise NotImplementedError("Dataset is an abstract class.") def __getitem__(self, item): - raise NotImplementedError + raise NotImplementedError("Dataset is an abstract class.") def get_splits(self): pass @@ -84,26 +89,24 @@ class InMemoryDataset(Dataset): Parameters ---------- - examples : array_like - The dataset. meta_data : Metadata The metadata of this dataset. - targets : ? + examples : array_like + The dataset. + targets : array_like, optional + The targets used for the examples. If there is no target, `None` + should be used instead. See Also -------- Dataset : The parent class defining the interface of a dataset. - """ - def __init__(self, examples, meta_data, targets=None): - super(InMemoryDataset, self).__init__(meta_data) - - self.data = examples + def __init__(self, meta_data, examples, targets=None): + super(InMemoryDataset, self).__init__(meta_data, examples, targets) if targets is None: self.__iter__ = self._iter_without_target else: - self.targets = targets self.__iter__ = self._iter_with_target From 5ad4ee807880e91f0074e3b69dafc91bad055c7f Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 11 Mar 2014 17:21:36 -0400 Subject: [PATCH 07/65] Updated the datasets general methods concerning splits and applying preprocessing. --- mldata/dataset.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 7d09ce3..2a3d436 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -35,14 +35,42 @@ def __iter__(self): def __getitem__(self, item): raise NotImplementedError("Dataset is an abstract class.") + def __len__(self): + return self.meta_data.nb_examples + def get_splits(self): - pass + """Return the splits defined by the associated metadata. - def build(self): # Replace with constructor ? - pass + The split is given via a tuple of integer with each integers + representing the integer after the last id used by this split. For + example:: + + (5000, 6000, 7000) + + would give a test set of all examples from 0 to 4999, a validation + set of examples 5000 to 5999 and a test set of examples 6000 up to + 6999. 
This means that 7000 is also the number of examples in the + dataset. + + Returns + ------- + tuple of int + Where each integer gives the id of the example coming after the + last one in a split. + """ + return self.meta_data.splits def apply(self): - pass + """Apply the preprocess specified in the associated metadata. + + This methods simply apply the function given in the metadata (the + identity by default) to the dataset. This function is supposed to do + work on the data and the targets, leaving the rest intact. Still, + as long as the result is still a `Dataset`, `apply` will work. + """ + ds = self.meta_data.preprocess(self) + assert isinstance(ds, Dataset) + self = ds class Metadata(): @@ -77,7 +105,7 @@ def __init__(self): self.nb_examples = 0 self.dictionary = None self.splits = () - self.preprocess = None + self.preprocess = lambda x: x self.version = 0 From 1c605d1dabbae8fb71b1cf651e0994a25dd5b8dd Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 12 Mar 2014 11:14:25 -0400 Subject: [PATCH 08/65] Added a check to make sure splits are defined via a tuple. --- mldata/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 2a3d436..18d942a 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -57,8 +57,16 @@ def get_splits(self): tuple of int Where each integer gives the id of the example coming after the last one in a split. + + Notes + ----- + For now, only a tuple is accepted. Eventually, predicates over the + examples id could be supported. """ - return self.meta_data.splits + if isinstance(self.meta_data.splits, tuple): + return self.meta_data.splits + else + raise NotImplementedError("Only splits with tuple are supported.") def apply(self): """Apply the preprocess specified in the associated metadata. From 2fb5d6b8b049abc8e18353c90dbf709299a0cca4 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 12 Mar 2014 11:50:59 -0400 Subject: [PATCH 09/65] Deleted InMemoryDataset and transfered methods to Dataset The storage requirement will instead be controlled by the driver (hdf5). --- mldata/dataset.py | 65 +++++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 18d942a..39b4bc3 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -2,7 +2,7 @@ class Dataset(): - """The abstract superclass of every types of datasets used in MLData + """Interface to interact with physical dataset A `Dataset` presents a unified access to data, independent of the implementation details such as laziness. @@ -29,15 +29,29 @@ def __init__(self, meta_data, data, target=None): assert isinstance(meta_data, Metadata) self.meta_data = meta_data - def __iter__(self): - raise NotImplementedError("Dataset is an abstract class.") - - def __getitem__(self, item): - raise NotImplementedError("Dataset is an abstract class.") + if targets is None: + self.__iter__ = self._iter_without_target + self.__getitem__ = self._get_with_target + else: + self.__iter__ = self._iter_with_target + self.__getitem__ = self._get_without_target def __len__(self): return self.meta_data.nb_examples + def _iter_with_target(self): + pass + + def _get_with_target(self, key): + pass + + def _iter_without_target(self): + pass + + def _get_without_target(self, key): + pass + + def get_splits(self): """Return the splits defined by the associated metadata. 
@@ -117,45 +131,6 @@ def __init__(self): self.version = 0 -class InMemoryDataset(Dataset): - """Build a dataset entirely contained in memory. - - Load the data (an array-like object) in memory. Random access is then - insured to be fast. - - Parameters - ---------- - meta_data : Metadata - The metadata of this dataset. - examples : array_like - The dataset. - targets : array_like, optional - The targets used for the examples. If there is no target, `None` - should be used instead. - - See Also - -------- - Dataset : The parent class defining the interface of a dataset. - """ - def __init__(self, meta_data, examples, targets=None): - super(InMemoryDataset, self).__init__(meta_data, examples, targets) - - if targets is None: - self.__iter__ = self._iter_without_target - else: - self.__iter__ = self._iter_with_target - - - def __getitem__(self, item): - pass - - def _iter_with_target(self): - pass - - def _iter_without_target(self): - pass - - class Dictionary: """Word / integer association list From 6d8ee8bfc6cfc0694eb847d57e2cc9f9b946ba7e Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 12 Mar 2014 12:16:23 -0400 Subject: [PATCH 10/65] Added definitions of iterator and get item supporting supervised and unsupervised learning. --- mldata/dataset.py | 68 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 39b4bc3..4670fd9 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -22,6 +22,7 @@ class Dataset(): target : array_like, optional The array of target to use for supervised learning. `target` should be `None` when the dataset doesn't support supervised learning. + """ def __init__(self, meta_data, data, target=None): self.data = data @@ -39,19 +40,6 @@ def __init__(self, meta_data, data, target=None): def __len__(self): return self.meta_data.nb_examples - def _iter_with_target(self): - pass - - def _get_with_target(self, key): - pass - - def _iter_without_target(self): - pass - - def _get_without_target(self, key): - pass - - def get_splits(self): """Return the splits defined by the associated metadata. @@ -76,10 +64,11 @@ def get_splits(self): ----- For now, only a tuple is accepted. Eventually, predicates over the examples id could be supported. + """ if isinstance(self.meta_data.splits, tuple): return self.meta_data.splits - else + else: raise NotImplementedError("Only splits with tuple are supported.") def apply(self): @@ -89,11 +78,60 @@ def apply(self): identity by default) to the dataset. This function is supposed to do work on the data and the targets, leaving the rest intact. Still, as long as the result is still a `Dataset`, `apply` will work. + """ ds = self.meta_data.preprocess(self) assert isinstance(ds, Dataset) self = ds + def _iter_with_target(self): + """Provide an iterator when the Dataset has a target.""" + for (ex, tg) in zip(self.data, self.targets): + yield (ex, tg) + + def _get_with_target(self, key): + """Get the entry specified by the key. + + Parameters + ---------- + key : numpy-like key + The `key` can be a single integer, a slice or a tuple defining + coordinates. Can be treated as a NumPy key. + + Returns + ------- + (array_like, array_like) + Return the element specified by the key. It can be an array or + simply a scalar of the type defined by the data and target arrays. + The returned values are put in a tuple (data, target). 
+ + """ + return (self.data[key], self.target[key]) + + def _iter_without_target(self): + """Provide an iterator when the Dataset has no target.""" + for ex in self.data: + yield (ex,) + + def _get_without_target(self, key): + """Get the entry specified by the key. + + Parameters + ---------- + key : numpy-like key + The `key` can be a single integer, a slice or a tuple defining + coordinates. Can be treated as a NumPy key. + + Returns + ------- + (array_like, ) + Return the element specified by the key. It can be an array or + simply a scalar of the type defined by the data and target arrays. + The returned values are put in a tuple (data, ). + + """ + return (self.data[key],) + class Metadata(): """Keep track of information about a dataset. @@ -121,6 +159,7 @@ class Metadata(): A function that is callable on a `Dataset` to preprocess the data. version : int The version number of the dataset that is required. + """ def __init__(self): self.name = "Default" @@ -145,6 +184,7 @@ class Dictionary: Plans are for the dictionary to be implemented as a list of words alphabetically ordered with the index of the word being its id. A method implements a binary search over the words in order to retrieve its id. + """ def __init__(self): From 35f112f15e6a87de4e6429b1d8fc2576c8939a27 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 18 Mar 2014 17:23:42 -0400 Subject: [PATCH 11/65] utils to save and load the config file which contains the path to the datasets folders. --- mldata/utils/config.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 mldata/utils/config.py diff --git a/mldata/utils/config.py b/mldata/utils/config.py new file mode 100644 index 0000000..6b443f1 --- /dev/null +++ b/mldata/utils/config.py @@ -0,0 +1,38 @@ +import configparser +from os.path import expanduser + + +def create_default_config(): + """ Build and save a default config file for MLData. + + The default config is saved as ``.MLDataConfig`` in the ``$HOME`` folder + or its equivalent. The only thing present in the config file is a list of + path of folders where datasets are stored. + + """ + cp = configparser.ConfigParser() + cp['datasets'] = {'paths': '[' + expanduser("~")+'/.datasets' + ']'} + save(cp) + + +def save(config): + """ Save a config file in the default config file emplacement.""" + config.write(expanduser("~")+'.mldataConfig') + + +def load_paths(): + """ Load the config file at the default emplacement. + + Returns + ------- + [str] + A list of strings giving the paths to dataset folders. + + """ + cp = configparser.ConfigParser() + cp.read(expanduser("~")+'.mldataConfig') + l = cp['datasets']['paths'] + assert isinstance(eval(l), list), "The paths " + l + " is not a list." + assert all(isinstance(e, str) for e in eval(l)), "Elements of the list" +\ + l + " are not strings." + return eval(l) \ No newline at end of file From aaffb2fd0c5977d3ca44a80ec19022b72140a258 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 26 Mar 2014 16:40:01 -0400 Subject: [PATCH 12/65] Changed the config file logic to support by dataset path. 
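The config file now keeps one section for the storage root and one section
indexing each dataset by name. A freshly created ~/.mldataConfig is meant to
look roughly like this (the home directory shown is hypothetical):

    [config]
    path = /home/user/.datasets

    [datasets]

    # Datasets path shouldn't be changed manually.

add_dataset() then records each registered dataset under [datasets] as
"<name> = <path to its folder>".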
--- mldata/utils/config.py | 95 +++++++++++++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 19 deletions(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 6b443f1..5188f5e 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -1,38 +1,95 @@ import configparser +import os from os.path import expanduser +from shutil import rmtree -def create_default_config(): - """ Build and save a default config file for MLData. +CONFIGFILE = expanduser("~")+'.mldataConfig' - The default config is saved as ``.MLDataConfig`` in the ``$HOME`` folder - or its equivalent. The only thing present in the config file is a list of - path of folders where datasets are stored. +def add_dataset(dataset_name): + """ Add a dataset to the index.""" - """ - cp = configparser.ConfigParser() - cp['datasets'] = {'paths': '[' + expanduser("~")+'/.datasets' + ']'} - save(cp) + path = os.path.join(_load_path(), dataset_name) + + if not os.path.isdir(path): + os.mkdir(path) + + cp = _load_config() + cp['datasets'][dataset_name] = path + _save_config(cp) + +def remove_dataset(dataset_name): + """ Remove a dataset from the index.""" + path = os.path.join(_load_path(), dataset_name) + + if os.path.isdir(path): # Does path exist ? + rmtree(path, ignore_errors=True) + + cp = _load_config() + cp.remove_option('datasets', dataset_name) + _save_config(cp) + +def get_dataset_path(dataset_name): + """ Retreive the dataset path. + + Parameters + ---------- + dataset_name : str + Name of a dataset + + Returns + ------- + str + The string of the path where ``dataset_name`` is saved. + Raises + ------ + KeyError + If the path specified in the config file does not exist in the system. + """ + cp = _load_config() + path = cp['datasets'][dataset_name] + if not os.path.isdir(path): + raise KeyError("Wrong path in .mldataConfig.") + else: + return path -def save(config): +def _save_config(config): """ Save a config file in the default config file emplacement.""" config.write(expanduser("~")+'.mldataConfig') -def load_paths(): +def _load_config(): + """ Loads the configuration file for MLData.""" + if not os.path.exists(CONFIGFILE): + _create_default_config() + return configparser.ConfigParser().read(CONFIGFILE) + +def _create_default_config(): + """ Build and save a default config file for MLData. + + The default config is saved as ``.MLDataConfig`` in the ``$HOME`` folder + or its equivalent. It stores the emplacement of dataset files and make an + index of accessible datasets. + + """ + cp = configparser.ConfigParser() + cp['config'] = {'path': expanduser("~")+'/.datasets'} + cp['datasets'] = {} + _save_config(cp) + with open(CONFIGFILE, 'a') as f: + f.write("# Datasets path shouldn't be changed manually.") + +def _load_path(): """ Load the config file at the default emplacement. Returns ------- - [str] + str A list of strings giving the paths to dataset folders. """ - cp = configparser.ConfigParser() - cp.read(expanduser("~")+'.mldataConfig') - l = cp['datasets']['paths'] - assert isinstance(eval(l), list), "The paths " + l + " is not a list." - assert all(isinstance(e, str) for e in eval(l)), "Elements of the list" +\ - l + " are not strings." - return eval(l) \ No newline at end of file + cp = _load_config() + path = cp['config']['path'] + assert os.path.isdir(path), "Configured path is not a valid directory." 
+ return path \ No newline at end of file From 7ef811c615de3aae4dc5abe423390c0bc4b77494 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 26 Mar 2014 16:57:13 -0400 Subject: [PATCH 13/65] Added a check to see if a specific dataset path exists. --- mldata/utils/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 5188f5e..3156d76 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -54,6 +54,10 @@ def get_dataset_path(dataset_name): else: return path +def dataset_exists(dataset_name): + """ Check if the dataset exists.""" + return _load_config().has_option('datasets', dataset_name) + def _save_config(config): """ Save a config file in the default config file emplacement.""" config.write(expanduser("~")+'.mldataConfig') From 625bfea55409497f526aa9413be0f1908466b4cd Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 26 Mar 2014 17:22:50 -0400 Subject: [PATCH 14/65] Added an hash function for easy versioning. --- mldata/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 4670fd9..8f1d80d 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -1,5 +1,5 @@ """Datasets store the data used for experiments.""" - +import hashlib class Dataset(): """Interface to interact with physical dataset @@ -40,6 +40,14 @@ def __init__(self, meta_data, data, target=None): def __len__(self): return self.meta_data.nb_examples + def __hash__(self): + """ Hash function used for versioning.""" + hasher = hashlib.md5() + hasher.update(self.data) + if self.target is not None: + hasher.update(self.target) + return hasher.hexdigest()[:8] + def get_splits(self): """Return the splits defined by the associated metadata. From 79fc3440fab31e6639a8ddbf7403cf991031c719 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 31 Mar 2014 20:34:34 -0400 Subject: [PATCH 15/65] New version of dataset_store with versioning and metadata supported. --- mldata/dataset_store.py | 165 +++++++++++++++++++++++++++++++--------- 1 file changed, 131 insertions(+), 34 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 17d7dc8..71d5723 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -1,48 +1,145 @@ - +""" Manages dataset read/write operations.""" import os -import h5py -import numpy as np import itertools -import types -import mldata -import mldata.utils -from mldata.dataset import Dataset, LazyDataset +import pickle as pk +import hashlib +import h5py +import numpy as np -from mldata.utils.constants import DATASETS_FOLDER +from mldata.dataset import Dataset, Metadata +import mldata.utils.config as cfg from mldata.utils.utils import buffered_iter -def supervised_factory(examples, targets): - def lazy_iter(): - for e, t in itertools.izip(buffered_iter(examples), buffered_iter(targets)): - yield e, t - lazy_functions = { - '__iter__': lazy_iter, - } +def load(dset_name, version_name="baseDataset", lazy=False): + """ Load a dataset given its name. + + The load function will load the ``Dataset`` ``name`` provided it exists in + one of the datasets folders. This function allows reading of files which + are bigger than available memory using ``h5py``. + + Parameters + ---------- + name : str + The name of the dataset to load. The first match from the list of + dataset folder will be used, thus allowing private copy of a dataset. 
+ lazy : bool + If set to ``True``, the dataset will be read with ``h5py`` without + loading the whole dataset in memory. If set to ``False``, the file is + mapped in memory. + + Returns + ------- + Dataset + Return the loaded dataset, if it exists. Else, return ``None``. + + """ + path = None + if cfg.dataset_exists(dset_name): + path = cfg.get_dataset_path(dset_name) + return _load_from_file(dset_name, path, lazy) + +def save(dataset, version_name="baseDataset"): + """ Save the dataset, manages versions. + + A ``Dataset`` is saved according to its name and the ``version_name`` + provided. The ``version_name`` is used to denote different view of the + data, either using the ``preprocess`` field of a ``Metadata`` class or by + saving a new version of the dataset (with a different hash). The first + method is the most compact while the second method is more efficient when + loading a dataset. + + To save a dataset using the preprocessing method, the dataset *must not* + contain the preprocessed data, but the original dataset on which the + preprocess is applied. + + The dataset is split between two files : + + - the data file ``[hash].data`` + - the metadata file [dataset Name]_[dataset version].meta + + This function will replace a metadata file with the same version name + without prompting. - return lazy_functions + Parameters + ---------- + dataset : Dataset + The dataset to be saved. + version_name : str + If this is a special version of a dataset, -def load(path_or_name, lazy=False): - path = path_or_name - if not os.path.isfile(path): - if (path_or_name + ".h5") not in os.listdir(DATASETS_FOLDER): - print "Unknown dataset: '{0}'".format(path_or_name) - return + """ + dset_name = dataset.meta_data.name - path = os.path.join(DATASETS_FOLDER, path_or_name + ".h5") + if not cfg.dataset_exists(dset_name): + cfg.add_dataset(dset_name) - return _load_from_file(path, lazy) + dset_path = cfg.get_dataset_path(dset_name) -def _load_from_file(path, lazy=False): - dataset = Dataset() - - if not lazy: - with h5py.File(path, mode='r') as f: - dataset = Dataset(itertools.izip(f['input'][()], f['output'][()])) + dset_hash = dataset.__hash__() + dataset.meta_data.hash = dset_hash # insures metadata hash is up to date + + dset_file = dataset.meta_data.hash + ".data" + _save_dataset(dataset, dset_path, dset_file) + + meta_file = dset_name + '_' + version_name + ".meta" + _save_metadata(dataset.meta_data, dset_path, meta_file) + +def _load_from_file(name, path, lazy): + """ Call to ``h5py`` to load the file. + + """ + metadata = None + with open(os.path.join(path, name) + '.meta', 'rb') as f: + metadata = pk.load(f) + + dataset = None + file_to_load = metadata.hash + ".data" + if lazy: + dataset = h5py.File(file_to_load, mode='r', driver=None) else: - f = h5py.File(path, mode='r') - lazy_functions = supervised_factory(f['input'], f['output']) - dataset = LazyDataset(lazy_functions) + dataset = h5py.File(file_to_load, mode='r', driver=core) + + data = dataset['/']["data"] + target = dataset['/']["target"] + + return Dataset(metadata, data, target) + +def _save_dataset(dataset, path, filename): + """Call to ``h5py`` to write the dataset + + Save the dataset and the associated metadata into their respective folder in + the dataset folder. 
+ + Parameters + ---------- + dataset : Dataset + path : str + filename : str + + """ + if filename not in os.listdir(path): + fullname = os.path.join(path, filename) + with h5py.File(fullname, mode='w') as f: + f.create_dataset('data', dataset.data) + if dataset.target is not Null: + f.create_dataset('targets', dataset.target) + +def _save_metadata(metadata, path, filename): + """ Pickle the metadata. + + Parameters + ---------- + metadata : Metadata + path : str + filename : str + + .. todo:: A dataset could be orphaned if overwritten by another metadata + file. This needs to be checked in a future version. + + """ - return dataset + if filename not in os.listdir(path): + with open(os.path.join(path, filename), 'wb') as f: + pk.dump(metadata, f, pk.HIGHEST_PROTOCOL) \ No newline at end of file From f11ba439e19a27cd47597e8f676c53577cb18e4e Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Thu, 3 Apr 2014 14:35:54 -0400 Subject: [PATCH 16/65] Added an importer for CSV files based on numpy.loadtxt() --- mldata/dataset_store.py | 94 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 71d5723..d461323 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -1,8 +1,6 @@ """ Manages dataset read/write operations.""" import os -import itertools import pickle as pk -import hashlib import h5py import numpy as np @@ -24,10 +22,13 @@ def load(dset_name, version_name="baseDataset", lazy=False): name : str The name of the dataset to load. The first match from the list of dataset folder will be used, thus allowing private copy of a dataset. + version_name : str + If this is a special version of a dataset, use this name to indicate + it. Default: "baseDataset". lazy : bool If set to ``True``, the dataset will be read with ``h5py`` without loading the whole dataset in memory. If set to ``False``, the file is - mapped in memory. + mapped in memory. Default: False. Returns ------- @@ -67,7 +68,8 @@ def save(dataset, version_name="baseDataset"): dataset : Dataset The dataset to be saved. version_name : str - If this is a special version of a dataset, + If this is a special version of a dataset, use this name to indicate + it. Default: "baseDataset". """ dset_name = dataset.meta_data.name @@ -142,4 +144,86 @@ def _save_metadata(metadata, path, filename): if filename not in os.listdir(path): with open(os.path.join(path, filename), 'wb') as f: - pk.dump(metadata, f, pk.HIGHEST_PROTOCOL) \ No newline at end of file + pk.dump(metadata, f, pk.HIGHEST_PROTOCOL) + +def CSV_importer(filepath, + name, + splits, + target_column=None, + dtype=np.float64, + comments='#', + delimiter=' ', + converters=None, + skiprows=0, + usecols=None): + """ Import a CSV file into a ``Dataset``. + + From the ``filepath`` of a CSV file (using commas), create a ``Dataset`` + which can then be saved on disk. This importer supports only numbered + inputs (int, float, boolean values). + + Parameters + ---------- + filepath : str + The path of the CSV file to be imported. + name : str + The name of this dataset used to store the ``Dataset`` on disk. + splits : tuple of int + Gives the split of the dataset, like (train, valid, test). The + integers required is the id of the last example of a sub-dataset plus 1. + For example, if there is 8000 examples with 5000 in the training set, + 2000 in the validation set and 1000 in the test set, the splits would be + ``(5000, 7000, 8000)``. 
+ target_column : int, optional + The column number of the target. If no target is provided, set to + ``None``. Default: None. + dtype : data-type, optional + Data-type of the resulting array; default: float. If this is a record + data-type, the resulting array will be 1-dimensional, and each row will + be interpreted as an element of the array. In this case, the number of + columns used must match the number of fields in the data-type. + comments : str, optional + The character used to indicate the start of a comment; default: ‘#’. + delimiter : str, optional + The string used to separate values. By default, this is any whitespace. + converters : dict, optional + A dictionary mapping column number to a function that will convert that + column to a float. E.g., if column 0 is a date string: + ``converters = {0: datestr2num}``. Converters can also be used to + provide a default value for missing data : + ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. + skiprows : int, optional + Skip the first skiprows lines; default: 0. + usecols : sequence, optional + Which columns to read, with 0 being the first. For example, + ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. The + default, None, results in all columns being read. + + Returns + ------- + Dataset + A ``Dataset`` with default values for ``Metadata``. + + """ + data = np.loadtxt(filepath, dtype, comments, delimiter, + converters, skiprows, usecols) + + meta = Metadata() + meta.name = name + meta.splits = splits + assert(len(data) == splits[-1], "The dataset read is not consistent " + "with the split given.") + meta.nb_examples = splits[-1] + + dset = None + if target_column is not None: + targets = data[:, target_column] + examples = data[:, list(range(0,target_column)) + + list(range(target_column+1, data.shape[1]))] + dset = Dataset(meta, examples, targets) + else: + dset = Dataset(meta, data) + + dset.meta_data.hash = dset.__hash__() + + return dset From ab262ff2b47a9ed95a636dbcdd4d941fcd5ca8b0 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Thu, 3 Apr 2014 14:37:27 -0400 Subject: [PATCH 17/65] Added comments and new parameters --- mldata/dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 8f1d80d..286ad92 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -155,18 +155,20 @@ class Metadata(): Attributes ---------- name : str - The name of the `Dataset` + The name of the `Dataset`. Default: "Default". nb_examples : int - The number of example in the dataset (including all splits). + The number of example in the dataset (including all splits). Default: 0. dictionary : Dictionary Gives a mapping of words (str) to id (int). Used only when the dataset has been saved as an array of numbers instead of text. + Default: None splits : tuple of int - Specifies the split used by this view of the dataset. + Specifies the split used by this view of the dataset. Default: (). preprocess : function or None A function that is callable on a `Dataset` to preprocess the data. - version : int - The version number of the dataset that is required. + Default: ``lambda x: x``. + hash : str + The hash of the linked ``Dataset``. Default: "". 
""" def __init__(self): @@ -175,7 +177,7 @@ def __init__(self): self.dictionary = None self.splits = () self.preprocess = lambda x: x - self.version = 0 + self.hash = "" class Dictionary: From 00730de85299fb934b15f34f1e80af46e1a4f9d7 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Thu, 3 Apr 2014 14:46:12 -0400 Subject: [PATCH 18/65] Efficient buffered iteration added. --- mldata/dataset.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 286ad92..848c864 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -1,6 +1,8 @@ """Datasets store the data used for experiments.""" import hashlib +BUFFER_SIZE = 1000 + class Dataset(): """Interface to interact with physical dataset @@ -94,8 +96,11 @@ def apply(self): def _iter_with_target(self): """Provide an iterator when the Dataset has a target.""" - for (ex, tg) in zip(self.data, self.targets): - yield (ex, tg) + buffer = min(BUFFER_SIZE, len(self.data)) + for idx in range(0, len(self.data), buffer): + for ex, tg in zip(self.data[idx:idx+buffer], + self.target[idx:idx+buffer]): + yield (ex,) def _get_with_target(self, key): """Get the entry specified by the key. @@ -118,8 +123,10 @@ def _get_with_target(self, key): def _iter_without_target(self): """Provide an iterator when the Dataset has no target.""" - for ex in self.data: - yield (ex,) + buffer = min(BUFFER_SIZE, len(self.data)) + for idx in range(0, len(self.data), buffer): + for ex in self.data[idx:idx+buffer]: + yield (ex,) def _get_without_target(self, key): """Get the entry specified by the key. From b21420e91d8f466dc2e07fc3f824afaa93e33a88 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Thu, 3 Apr 2014 15:34:51 -0400 Subject: [PATCH 19/65] Removed uses of utils/utils --- mldata/dataset_store.py | 1 - mldata/utils/utils.py | 6 ------ 2 files changed, 7 deletions(-) delete mode 100644 mldata/utils/utils.py diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index d461323..c845fcc 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -7,7 +7,6 @@ from mldata.dataset import Dataset, Metadata import mldata.utils.config as cfg -from mldata.utils.utils import buffered_iter def load(dset_name, version_name="baseDataset", lazy=False): diff --git a/mldata/utils/utils.py b/mldata/utils/utils.py deleted file mode 100644 index 662a006..0000000 --- a/mldata/utils/utils.py +++ /dev/null @@ -1,6 +0,0 @@ - - -def buffered_iter(arr, buffer_size=1000): - for idx in xrange(0, len(arr), buffer_size): - for e in arr[idx:idx+buffer_size]: - yield e \ No newline at end of file From fc9e766d3b2698f462f30c874c35025dd3ecbf6b Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Thu, 3 Apr 2014 16:52:03 -0400 Subject: [PATCH 20/65] Test suite for Datasets --- tests/test_Dataset.py | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 tests/test_Dataset.py diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py new file mode 100644 index 0000000..0b07ded --- /dev/null +++ b/tests/test_Dataset.py @@ -0,0 +1,81 @@ +import numpy as np +import nose.tools as nt +import copy + +from mldata.dataset import Dataset, Metadata + +class Dataset_test: + def setup_class(self): + self.dataSmall = np.random.rand(30,5) + self.dataLarge = np.random.rand(3000, 5) + self.targetSmall = np.random.rand(30,1) + self.targetLarge = np.random.rand(3000,1) + + self.metadataS = Metadata() + self.metadataS.splits = (10,20,30) + self.metadataS.nb_examples = 30 + self.dsetS = 
Dataset(self.metadataS,self.dataSmall, self.targetSmall) + + def test_Dataset(self): + dset = Dataset(self.metadataS, self.dataSmall) + nt.assert_equal(dset.meta_data, self.metadataS) + nt.assert_equal(dset.data, self.dataSmall) + nt.assert_is_none(dset.target) + + dsetS = Dataset(self.metadataS, self.dataSmall, self.targetSmall) + nt.assert_is_not_none(dsetL.target) + + def test_hash(self): + nt.assert_equal(self.dsetS.__hash__(), self.dsetS.__hash__()) + + dset = Dataset(self.metadataS, self.dataSmall, self.targetSmall) + nt.assert_equal(dset.__hash__(), self.dsetS.__hash__()) + + dset2 = Dataset(self.metadataS, self.dataSmall) + nt.assert_not_equal(dset2.__hash__(), dset.__hash__()) + + dset3 = Dataset(self.metadataS, self.dataLarge) + nt.assert_not_equal(dset2.__hash__(), dset3.__hash__()) + nt.assert_not_equal(dset3.__hash__(), dset.__hash__()) + + meta = Metadata() + meta.name = "AnotherName" + meta.splits = (10,20,30) + dset4 = Dataset(meta, self.dataSmall) + nt.assert_equal(dset4.__hash__(), dset2.__hash__()) + nt.assert_not_equal(dset4.__hash__(), dset3.__hash__()) + + def test_get_splits(self): + nt.assert_equal(self.dsetS.get_splits(), (10,20,30)) + + def test_len(self): + nt.assert_equal(len(self.dsetS), len(self.dsetS.data)) + nt.assert_equal(len(self.dsetS), self.dsetS.meta_data.nb_examples) + nt.assert_equal(len(self.dsetS), self.dsetS.get_splits()[-1]) + + def test_preprocess(self): + data2 = self.dsetS.data * 2 + meta = copy.deepcopy(self.metadataS) + meta.preprocess = lambda x: Dataset(self.metadataS, x.data * 2, + x.data*2) + dset2 = Dataset(meta, self.dataSmall, self.targetSmall) + + nt.assert_equal(data2, dset2.data) + + def test_iter(self): + # Without targets + dt, tg = [[z[i] for z in self.dsetS] for i in [0,1]] + nt.assert_equal(np.array(dt), self.dataSmall) + + dset = Dataset(self.metadataS, self.dataSmall) + nt.assert_equal(np.array([z[0] for z in dset]), self.dataSmall) + + for a,b in zip(self.dataSmall, self.dsetS): + nt.assert_equal(a,b) + + def test_get(self): + for i in range(0, len(self.dataSmall)): + nt.assert_equal(self.dataSmall[i],self.dsetS[i][0]) + + + From ef46db928cbd6008d9ced8fe792d0a95e5c93ab6 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 8 Apr 2014 16:48:20 -0400 Subject: [PATCH 21/65] Test suite for Config --- tests/utils/test_config.py | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/utils/test_config.py diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py new file mode 100644 index 0000000..75d5b3f --- /dev/null +++ b/tests/utils/test_config.py @@ -0,0 +1,40 @@ +import os + +import configparser as cp +import nose.tools as nt + +import mldata.utils.config as cfg + +def setup_module(): + # save current config file + os.rename(cfg.CONFIGFILE, cfg.CONFIGFILE +".bak") + +def teardown_module(): + # restore config file + os.rename(cfg.CONFIGFILE +".bak", cfg.CONFIGFILE) + +def test_load_config(): + cf = cfg._load_config() + path = os.path.expanduser("~") + '/.datasets' + + nt.assert_equal(path, cf['config']['path']) + nt.assert_equal(path, cfg._load_path()) + nt.assert_true(cp.has_section('dataset')) + +def test_add_remove(): + cfg.add_dataset("test_dataset") + nt.assert_true(cfg.dataset_exists("test_dataset")) + + nt.assert_equal(cfg.get_dataset_path("test_dataset"), + os.join(cfg._load_path(), "test_dataset")) + path = cfg.get_dataset_path("test_dataset") + nt.assert_true(os.path.isdir(path)) + + cfg.remove_dataset("test_dataset") + 
nt.assert_false(cfg.dataset_exists("test_dataset")) + nt.assert_false(os.path.isdir(path)) + + + + + From c92369984421854024850961ee37fb39a526fd9d Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 8 Apr 2014 17:28:05 -0400 Subject: [PATCH 22/65] Test suite for Dataset_store --- tests/test_dataset_store.py | 107 ++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 59 deletions(-) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 17e3c50..fa65b11 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -1,61 +1,50 @@ -from ipdb import set_trace as dbg - import os -import tempfile -import hashlib + import numpy as np -import itertools -import time -from functools import partial - -from numpy.testing import (assert_equal, - assert_almost_equal, - assert_array_equal, - assert_array_almost_equal, - assert_raises) - - -import mldata -import mldata.dataset_store as dataset_store - -DATA_DIR = os.path.join(os.path.realpath(mldata.__path__[0]), "..", "tests", "data") - -def load_mnist(lazy): - """ - Load mnist dataset from a hdf5 file and test if it matches mlpython's one. - """ - dataset_name = 'mnist' - - start = time.time() - import mlpython.datasets.store as mlstore - mldatasets = mlstore.get_classification_problem(dataset_name, load_to_memory= (not lazy)) - print "mlpython version loaded ({0:.2f}sec).".format(time.time() - start) - - start = time.time() - dataset_name = os.path.join(os.environ['MLPYTHON_DATASET_REPO'], dataset_name + ".h5") - dataset = mldata.dataset_store.load(dataset_name, lazy=lazy) - print "mldata version loaded ({0:.2f}sec).".format(time.time() - start) - - print "Comparing first 1000..." - count = 0 - for (e1, t1), (e2, t2) in itertools.izip(dataset, itertools.chain(*mldatasets)): - #print t1, t2 - assert_array_almost_equal(e1, e2) - assert_equal(t1, t2) - - count += 1 - if count >= 1000: - break - - -def test_load_mnist(): - """ - Load mnist dataset from a hdf5 file and test if it matches mlpython's one. - """ - load_mnist(lazy=False) - -def test_load_mnist_lazy(): - """ - Lazy load mnist dataset from a hdf5 file and test if it matches mlpython's one. 
- """ - load_mnist(lazy=True) +import nose.tools as nt + +import mldata.dataset_store as ds + +RND_MATRIX = np.random.rand(100,10) + + +def setup_module(): + np.savetxt("test.csv", RND_MATRIX) + +def teardown_module(): + os.remove("test.csv") + +def test_CSV_importer(): + dset = ds.CSV_importer("test.csv", + "test_dset", + (70, 20, 10), + 0) + + nt.assert_equal(RND_MATRIX[:,1:], dset.data) + +def test_save_load(): + dset = ds.CSV_importer("test.csv", + "test_dset", + (70, 20, 10), + 0) + ds.save(dset, "v1") + dset2 = ds.load("test_dset", "v1") + + nt.assert_equal(dset.__hash__(), dset2.__hash__()) + nt.assert_equal(dset.meta_data.name, dset2.meta_data.name) + nt.assert_equal(dset.meta_data.dictionary, dset2.meta_data.dictionary) + nt.assert_equal(dset.meta_data.nb_examples, dset2.meta_data.nb_examples) + nt.assert_equal(dset.meta_data.splits, dset2.meta_data.splits) + nt.assert_equal(dset2.meta_data.hash, dset2.__hash__()) + + dset2.data[0,0] = 2 + + ds.save(dset2, version_name=v2) + dset3 = ds.load("test_dset", "v2") + + nt.assert_not_equal(dset3.__hash__(), dset.__hash__()) + nt.assert_equal(dset3.meta_data.hash, dset3.__hash__()) + nt.assert_equal(dset.meta_data.name, dset3.meta_data.name) + nt.assert_equal(dset.meta_data.dictionary, dset3.meta_data.dictionary) + nt.assert_equal(dset.meta_data.nb_examples, dset3.meta_data.nb_examples) + nt.assert_equal(dset.meta_data.splits, dset3.meta_data.splits) \ No newline at end of file From 4c8b31339adb418f9e612d7931a47cdf84e096a3 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 15:38:14 -0400 Subject: [PATCH 23/65] Corrected saving of default config file --- mldata/utils/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 3156d76..7e6ad3a 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -60,7 +60,8 @@ def dataset_exists(dataset_name): def _save_config(config): """ Save a config file in the default config file emplacement.""" - config.write(expanduser("~")+'.mldataConfig') + with open(CONFIGFILE, 'w') as f: + config.write(f) def _load_config(): From d8b4cfaed555b3a3b890af5b575f1eb1bbff74ff Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 16:59:43 -0400 Subject: [PATCH 24/65] Corrected loading of config file --- mldata/utils/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 7e6ad3a..e6f00f4 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -68,7 +68,9 @@ def _load_config(): """ Loads the configuration file for MLData.""" if not os.path.exists(CONFIGFILE): _create_default_config() - return configparser.ConfigParser().read(CONFIGFILE) + cfg = configparser.ConfigParser() + cfg.read(CONFIGFILE) + return cfg def _create_default_config(): """ Build and save a default config file for MLData. 
From f3ea2a60c95677ee15a2fce260b3e39940efd160 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 17:08:23 -0400 Subject: [PATCH 25/65] Corrected joining of parts in CONFIGFILE --- mldata/utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index e6f00f4..0c436b5 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -1,10 +1,10 @@ import configparser import os -from os.path import expanduser +from os.path import expanduser, join from shutil import rmtree -CONFIGFILE = expanduser("~")+'.mldataConfig' +CONFIGFILE = join(expanduser("~"), '.mldataConfig') def add_dataset(dataset_name): """ Add a dataset to the index.""" From 0ff610ebd8ae3552e8dd2e7947a7b0f74158866c Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 17:30:20 -0400 Subject: [PATCH 26/65] Updated to python 3.4 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 59ccca6..7a23e78 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: python python: - - "3.3" + - "3.4" - "2.7" - "2.6" # - "pypy" From 347d24b768fb82b22c60a0a1f9b196ee5802e8d4 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 17:37:13 -0400 Subject: [PATCH 27/65] Small correction to skip a line --- mldata/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 0c436b5..b486672 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -85,7 +85,7 @@ def _create_default_config(): cp['datasets'] = {} _save_config(cp) with open(CONFIGFILE, 'a') as f: - f.write("# Datasets path shouldn't be changed manually.") + f.write("# Datasets path shouldn't be changed manually.\n") def _load_path(): """ Load the config file at the default emplacement. From d5adda55fc67c9c9dc30874d356ce708ed2e0508 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 18:07:11 -0400 Subject: [PATCH 28/65] assert syntax correction --- mldata/dataset_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index c845fcc..96c6d5f 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -210,8 +210,8 @@ def CSV_importer(filepath, meta = Metadata() meta.name = name meta.splits = splits - assert(len(data) == splits[-1], "The dataset read is not consistent " - "with the split given.") + assert len(data) == splits[-1], "The dataset read is not consistent with " \ + "the split given." meta.nb_examples = splits[-1] dset = None From 2d9b70f1c9eec27b5405a1cb44e43e3174baebed Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 18:07:37 -0400 Subject: [PATCH 29/65] Added __init__ files for tests --- tests/__init__.py | 0 tests/utils/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/utils/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 From 52bb43fadb4558fa2de30625063b378c095c944a Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 20:02:39 -0400 Subject: [PATCH 30/65] Changed the whole logic of __iter__ and __getitem__ __iter__() and __getitem__ are false methods. They actually are class methods, which makes their definition on the fly quite tricky. 
The old solution would reassing the correct definition in the __init__ method. However, since it is a class method, it also affect other object (that might have a need of the other iterator or getter). Thus, the best way to make it work is the naive way that checks each time is dataset.target is None. --- mldata/dataset.py | 101 +++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 60 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 848c864..568641d 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -32,13 +32,6 @@ def __init__(self, meta_data, data, target=None): assert isinstance(meta_data, Metadata) self.meta_data = meta_data - if targets is None: - self.__iter__ = self._iter_without_target - self.__getitem__ = self._get_with_target - else: - self.__iter__ = self._iter_with_target - self.__getitem__ = self._get_without_target - def __len__(self): return self.meta_data.nb_examples @@ -50,6 +43,47 @@ def __hash__(self): hasher.update(self.target) return hasher.hexdigest()[:8] + def __iter__(self): + """Provide an iterator when the Dataset has a target. + + ..todo: retest efficiency of this buffering in python3. With zip + being now lazy, it might not be better than the vanilla iter. + + """ + buffer = min(BUFFER_SIZE, len(self.data)) + if self.target is not None: + for idx in range(0, len(self.data), buffer): + for ex, tg in zip(self.data[idx:idx+buffer], + self.target[idx:idx+buffer]): + yield (ex,tg) + else: + for idx in range(0, len(self.data), buffer): + for ex in self.data[idx:idx+buffer]: + yield (ex,) + + def __getitem__(self, key): + """Get the entry specified by the key. + + Parameters + ---------- + key : numpy-like key + The `key` can be a single integer, a slice or a tuple defining + coordinates. Can be treated as a NumPy key. + + Returns + ------- + (array_like, array_like) or (array_like,) + Return the element specified by the key. It can be an array or + simply a scalar of the type defined by the data [and target + arrays]. + The returned values are put in a tuple (data, target) or (data,). + + """ + if self.target is not None: + return (self.data[key], self.target[key]) + else: + return (self.data[key],) + def get_splits(self): """Return the splits defined by the associated metadata. @@ -94,59 +128,6 @@ def apply(self): assert isinstance(ds, Dataset) self = ds - def _iter_with_target(self): - """Provide an iterator when the Dataset has a target.""" - buffer = min(BUFFER_SIZE, len(self.data)) - for idx in range(0, len(self.data), buffer): - for ex, tg in zip(self.data[idx:idx+buffer], - self.target[idx:idx+buffer]): - yield (ex,) - - def _get_with_target(self, key): - """Get the entry specified by the key. - - Parameters - ---------- - key : numpy-like key - The `key` can be a single integer, a slice or a tuple defining - coordinates. Can be treated as a NumPy key. - - Returns - ------- - (array_like, array_like) - Return the element specified by the key. It can be an array or - simply a scalar of the type defined by the data and target arrays. - The returned values are put in a tuple (data, target). - - """ - return (self.data[key], self.target[key]) - - def _iter_without_target(self): - """Provide an iterator when the Dataset has no target.""" - buffer = min(BUFFER_SIZE, len(self.data)) - for idx in range(0, len(self.data), buffer): - for ex in self.data[idx:idx+buffer]: - yield (ex,) - - def _get_without_target(self, key): - """Get the entry specified by the key. 
- - Parameters - ---------- - key : numpy-like key - The `key` can be a single integer, a slice or a tuple defining - coordinates. Can be treated as a NumPy key. - - Returns - ------- - (array_like, ) - Return the element specified by the key. It can be an array or - simply a scalar of the type defined by the data and target arrays. - The returned values are put in a tuple (data, ). - - """ - return (self.data[key],) - class Metadata(): """Keep track of information about a dataset. From ab0b52b9f2445f896f0976fcf3ace8e68a10f6ab Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 9 Apr 2014 20:03:07 -0400 Subject: [PATCH 31/65] Changed test to remove errors when run. --- tests/test_Dataset.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index 0b07ded..f4a4a4b 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -5,6 +5,7 @@ from mldata.dataset import Dataset, Metadata class Dataset_test: + @classmethod def setup_class(self): self.dataSmall = np.random.rand(30,5) self.dataLarge = np.random.rand(3000, 5) @@ -19,11 +20,11 @@ def setup_class(self): def test_Dataset(self): dset = Dataset(self.metadataS, self.dataSmall) nt.assert_equal(dset.meta_data, self.metadataS) - nt.assert_equal(dset.data, self.dataSmall) + nt.assert_true(np.array_equal(dset.data, self.dataSmall)) nt.assert_is_none(dset.target) dsetS = Dataset(self.metadataS, self.dataSmall, self.targetSmall) - nt.assert_is_not_none(dsetL.target) + nt.assert_is_not_none(dsetS.target) def test_hash(self): nt.assert_equal(self.dsetS.__hash__(), self.dsetS.__hash__()) @@ -60,22 +61,23 @@ def test_preprocess(self): x.data*2) dset2 = Dataset(meta, self.dataSmall, self.targetSmall) - nt.assert_equal(data2, dset2.data) + nt.assert_true(np.array_equal(data2, dset2.data)) def test_iter(self): - # Without targets + # With targets dt, tg = [[z[i] for z in self.dsetS] for i in [0,1]] - nt.assert_equal(np.array(dt), self.dataSmall) - + nt.assert_true(np.array_equal(np.array(dt), self.dataSmall)) + # Without targets dset = Dataset(self.metadataS, self.dataSmall) - nt.assert_equal(np.array([z[0] for z in dset]), self.dataSmall) + nt.assert_true(np.array_equal(np.array([z[0] for z in dset]), + self.dataSmall)) for a,b in zip(self.dataSmall, self.dsetS): - nt.assert_equal(a,b) + nt.assert_true(np.array_equal(a,b)) def test_get(self): - for i in range(0, len(self.dataSmall)): - nt.assert_equal(self.dataSmall[i],self.dsetS[i][0]) + for i in range(len(self.dataSmall)): + nt.assert_true(np.array_equal(self.dataSmall[i],self.dsetS[i][0])) From 1b4b158b447a9f1a883779441680aaa0a11c8485 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 14:49:17 -0400 Subject: [PATCH 32/65] Create the default dataset directory. Insure the dataset folders wouldn't be created if it already existed. 
--- mldata/utils/config.py | 7 +++++-- tests/utils/test_config.py | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index b486672..b34a9ca 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -81,7 +81,10 @@ def _create_default_config(): """ cp = configparser.ConfigParser() - cp['config'] = {'path': expanduser("~")+'/.datasets'} + path = join(expanduser("~"), '.datasets') + if not os.path.isdir(path): + os.mkdir(path) + cp['config'] = {'path': path} cp['datasets'] = {} _save_config(cp) with open(CONFIGFILE, 'a') as f: @@ -99,4 +102,4 @@ def _load_path(): cp = _load_config() path = cp['config']['path'] assert os.path.isdir(path), "Configured path is not a valid directory." - return path \ No newline at end of file + return path diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 75d5b3f..ae1f266 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -15,18 +15,18 @@ def teardown_module(): def test_load_config(): cf = cfg._load_config() - path = os.path.expanduser("~") + '/.datasets' + path = os.path.join(os.path.expanduser("~"), '.datasets') nt.assert_equal(path, cf['config']['path']) nt.assert_equal(path, cfg._load_path()) - nt.assert_true(cp.has_section('dataset')) + nt.assert_true(cf.has_section('datasets')) def test_add_remove(): cfg.add_dataset("test_dataset") nt.assert_true(cfg.dataset_exists("test_dataset")) nt.assert_equal(cfg.get_dataset_path("test_dataset"), - os.join(cfg._load_path(), "test_dataset")) + os.path.join(cfg._load_path(), "test_dataset")) path = cfg.get_dataset_path("test_dataset") nt.assert_true(os.path.isdir(path)) From 93425f7d59518d687dfa4f4e5d7af9d93a940b70 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 16:35:53 -0400 Subject: [PATCH 33/65] Changed preprocess logic to follow python's capacities. --- mldata/dataset.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 568641d..fbcde56 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -38,9 +38,11 @@ def __len__(self): def __hash__(self): """ Hash function used for versioning.""" hasher = hashlib.md5() - hasher.update(self.data) + for l in self.data: + hasher.update(l.copy()) if self.target is not None: - hasher.update(self.target) + for l in self.target: + hasher.update(l.copy()) return hasher.hexdigest()[:8] def __iter__(self): @@ -123,10 +125,15 @@ def apply(self): work on the data and the targets, leaving the rest intact. Still, as long as the result is still a `Dataset`, `apply` will work. + Returns + ------- + Dataset + The preprocessed dataset. + """ ds = self.meta_data.preprocess(self) assert isinstance(ds, Dataset) - self = ds + return ds class Metadata(): @@ -187,4 +194,4 @@ class Dictionary: def __init__(self): raise NotImplementedError("The class Dictionary is not yet " - "implemented.") \ No newline at end of file + "implemented.") From 571fbf769015d787ebd42ba0c438e9a70d816cff Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 16:36:40 -0400 Subject: [PATCH 34/65] Changed preprocess logic to follow python's capacities. 
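The switch away from ``self = ds`` in ``apply`` reflects a general Python point: assigning to ``self`` inside a method only rebinds a local name, so the caller has to capture the returned object. A self-contained sketch with a toy class (not the library's API):

class Box:
    def __init__(self, value):
        self.value = value

    def apply_in_place(self):
        # Rebinds the local name ``self`` only; the caller's object is untouched.
        self = Box(self.value * 2)

    def apply(self):
        # Returning the new object lets the caller rebind its own name.
        return Box(self.value * 2)

b = Box(21)
b.apply_in_place()
assert b.value == 21
b = b.apply()
assert b.value == 42

This is why the updated test below reassigns the result: ``dset2 = dset2.apply()``.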
--- tests/test_Dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index f4a4a4b..5990fbd 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -55,12 +55,12 @@ def test_len(self): nt.assert_equal(len(self.dsetS), self.dsetS.get_splits()[-1]) def test_preprocess(self): - data2 = self.dsetS.data * 2 + data2 = self.dataSmall * 2 meta = copy.deepcopy(self.metadataS) - meta.preprocess = lambda x: Dataset(self.metadataS, x.data * 2, - x.data*2) + meta.preprocess = lambda x: Dataset(x.meta_data, x.data * 2, + x.target * 2) dset2 = Dataset(meta, self.dataSmall, self.targetSmall) - + dset2 = dset2.apply() nt.assert_true(np.array_equal(data2, dset2.data)) def test_iter(self): From e6523741fbf6e945bd86e8817ffeb3073a06eab0 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 16:37:03 -0400 Subject: [PATCH 35/65] Corrected a small mistake in the splits of the test case. --- tests/test_dataset_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index fa65b11..7902339 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -17,10 +17,10 @@ def teardown_module(): def test_CSV_importer(): dset = ds.CSV_importer("test.csv", "test_dset", - (70, 20, 10), + (70, 90, 100), 0) - nt.assert_equal(RND_MATRIX[:,1:], dset.data) + nt.assert_true(np.array_equal(RND_MATRIX[:,1:], dset.data)) def test_save_load(): dset = ds.CSV_importer("test.csv", @@ -47,4 +47,4 @@ def test_save_load(): nt.assert_equal(dset.meta_data.name, dset3.meta_data.name) nt.assert_equal(dset.meta_data.dictionary, dset3.meta_data.dictionary) nt.assert_equal(dset.meta_data.nb_examples, dset3.meta_data.nb_examples) - nt.assert_equal(dset.meta_data.splits, dset3.meta_data.splits) \ No newline at end of file + nt.assert_equal(dset.meta_data.splits, dset3.meta_data.splits) From 8bde933855d06f6a1575a4e0adeacc70045a57a4 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 16:42:48 -0400 Subject: [PATCH 36/65] Removed a nonsensical test. --- tests/test_Dataset.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index 5990fbd..e1260e2 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -72,9 +72,6 @@ def test_iter(self): nt.assert_true(np.array_equal(np.array([z[0] for z in dset]), self.dataSmall)) - for a,b in zip(self.dataSmall, self.dsetS): - nt.assert_true(np.array_equal(a,b)) - def test_get(self): for i in range(len(self.dataSmall)): nt.assert_true(np.array_equal(self.dataSmall[i],self.dsetS[i][0])) From ef37e3acb9c967b9f07cbb64c49d911140b14019 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:02:25 -0400 Subject: [PATCH 37/65] Corrected split argument in a test --- tests/test_dataset_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 7902339..8074bac 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -25,7 +25,7 @@ def test_CSV_importer(): def test_save_load(): dset = ds.CSV_importer("test.csv", "test_dset", - (70, 20, 10), + (70, 90, 100), 0) ds.save(dset, "v1") dset2 = ds.load("test_dset", "v1") From 62ff2a77807d26c2113176e7b4808fd8df6a04b6 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:02:45 -0400 Subject: [PATCH 38/65] Corrected how h5py is called to store a ndarray. 
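For reference on the fix below: ``create_dataset`` interprets its second positional argument as a shape, so an ndarray must be passed through the ``data`` keyword. A hedged sketch (the file name is illustrative):

import h5py
import numpy as np

arr = np.random.random((100, 10))

with h5py.File("example.h5", mode="w") as f:
    # Pass the array via the ``data`` keyword, not positionally.
    f.create_dataset("data", data=arr)

with h5py.File("example.h5", mode="r") as f:
    assert f["data"].shape == (100, 10)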
--- mldata/dataset_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 96c6d5f..f42af33 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -123,9 +123,9 @@ def _save_dataset(dataset, path, filename): if filename not in os.listdir(path): fullname = os.path.join(path, filename) with h5py.File(fullname, mode='w') as f: - f.create_dataset('data', dataset.data) - if dataset.target is not Null: - f.create_dataset('targets', dataset.target) + f.create_dataset('data', data=dataset.data) + if dataset.target is not None: + f.create_dataset('targets', data=dataset.target) def _save_metadata(metadata, path, filename): """ Pickle the metadata. From 035ba7dd91769ded9b40b442dea8fe77572a8723 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:12:06 -0400 Subject: [PATCH 39/65] Small corrections in tests --- tests/test_Dataset.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index e1260e2..11ba99b 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -4,18 +4,19 @@ from mldata.dataset import Dataset, Metadata + class Dataset_test: @classmethod def setup_class(self): - self.dataSmall = np.random.rand(30,5) + self.dataSmall = np.random.rand(30, 5) self.dataLarge = np.random.rand(3000, 5) - self.targetSmall = np.random.rand(30,1) - self.targetLarge = np.random.rand(3000,1) + self.targetSmall = np.random.rand(30, 1) + self.targetLarge = np.random.rand(3000, 1) self.metadataS = Metadata() - self.metadataS.splits = (10,20,30) + self.metadataS.splits = (10, 20, 30) self.metadataS.nb_examples = 30 - self.dsetS = Dataset(self.metadataS,self.dataSmall, self.targetSmall) + self.dsetS = Dataset(self.metadataS, self.dataSmall, self.targetSmall) def test_Dataset(self): dset = Dataset(self.metadataS, self.dataSmall) @@ -41,13 +42,13 @@ def test_hash(self): meta = Metadata() meta.name = "AnotherName" - meta.splits = (10,20,30) + meta.splits = (10, 20, 30) dset4 = Dataset(meta, self.dataSmall) nt.assert_equal(dset4.__hash__(), dset2.__hash__()) nt.assert_not_equal(dset4.__hash__(), dset3.__hash__()) def test_get_splits(self): - nt.assert_equal(self.dsetS.get_splits(), (10,20,30)) + nt.assert_equal(self.dsetS.get_splits(), (10, 20, 30)) def test_len(self): nt.assert_equal(len(self.dsetS), len(self.dsetS.data)) @@ -65,7 +66,7 @@ def test_preprocess(self): def test_iter(self): # With targets - dt, tg = [[z[i] for z in self.dsetS] for i in [0,1]] + dt, tg = [[z[i] for z in self.dsetS] for i in [0, 1]] nt.assert_true(np.array_equal(np.array(dt), self.dataSmall)) # Without targets dset = Dataset(self.metadataS, self.dataSmall) @@ -74,7 +75,7 @@ def test_iter(self): def test_get(self): for i in range(len(self.dataSmall)): - nt.assert_true(np.array_equal(self.dataSmall[i],self.dsetS[i][0])) + nt.assert_true(np.array_equal(self.dataSmall[i], self.dsetS[i][0])) From 85bae5046271eea7b946525fc582c8d282ba9218 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:20:07 -0400 Subject: [PATCH 40/65] Changed preprocess functions to named function as lambdas can't be pickled. --- mldata/dataset.py | 7 +++++-- tests/test_Dataset.py | 7 ++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index fbcde56..21acd5a 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -161,7 +161,8 @@ class Metadata(): Specifies the split used by this view of the dataset. 
Default: (). preprocess : function or None A function that is callable on a `Dataset` to preprocess the data. - Default: ``lambda x: x``. + The function cannot be a lambda function since those can't be pickled. + Default: identity function. hash : str The hash of the linked ``Dataset``. Default: "". @@ -171,9 +172,11 @@ def __init__(self): self.nb_examples = 0 self.dictionary = None self.splits = () - self.preprocess = lambda x: x + self.preprocess = default_preprocess self.hash = "" +def default_preprocess(dset): + return dset class Dictionary: """Word / integer association list diff --git a/tests/test_Dataset.py b/tests/test_Dataset.py index 11ba99b..d271b1a 100644 --- a/tests/test_Dataset.py +++ b/tests/test_Dataset.py @@ -58,8 +58,7 @@ def test_len(self): def test_preprocess(self): data2 = self.dataSmall * 2 meta = copy.deepcopy(self.metadataS) - meta.preprocess = lambda x: Dataset(x.meta_data, x.data * 2, - x.target * 2) + meta.preprocess = double_dset dset2 = Dataset(meta, self.dataSmall, self.targetSmall) dset2 = dset2.apply() nt.assert_true(np.array_equal(data2, dset2.data)) @@ -77,5 +76,7 @@ def test_get(self): for i in range(len(self.dataSmall)): nt.assert_true(np.array_equal(self.dataSmall[i], self.dsetS[i][0])) - +def double_dset(dset): + """ Basic preprocessing function. """ + return Dataset(dset.meta_data, dset.data * 2, dset.target * 2) From 75bc15cbadc975130720fb742aee90fc9d162884 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:54:52 -0400 Subject: [PATCH 41/65] Corrected tests to reflect read-only datasets --- tests/test_dataset_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 8074bac..1bf1cc7 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -37,9 +37,10 @@ def test_save_load(): nt.assert_equal(dset.meta_data.splits, dset2.meta_data.splits) nt.assert_equal(dset2.meta_data.hash, dset2.__hash__()) - dset2.data[0,0] = 2 + ndata = np.array(dset2.data) + dset2.data = ndata * 2 - ds.save(dset2, version_name=v2) + ds.save(dset2, version_name="v2") dset3 = ds.load("test_dset", "v2") nt.assert_not_equal(dset3.__hash__(), dset.__hash__()) From b9d5ef4ab24b5bf52270d02fb073053b9189b064 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 17:55:25 -0400 Subject: [PATCH 42/65] Corrected errors in dataset loading --- mldata/dataset_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index f42af33..c146990 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -38,7 +38,7 @@ def load(dset_name, version_name="baseDataset", lazy=False): path = None if cfg.dataset_exists(dset_name): path = cfg.get_dataset_path(dset_name) - return _load_from_file(dset_name, path, lazy) + return _load_from_file(dset_name + '_' + version_name, path, lazy) def save(dataset, version_name="baseDataset"): """ Save the dataset, manages versions. 
@@ -96,14 +96,14 @@ def _load_from_file(name, path, lazy): metadata = pk.load(f) dataset = None - file_to_load = metadata.hash + ".data" + file_to_load = os.path.join(path, metadata.hash + ".data") if lazy: dataset = h5py.File(file_to_load, mode='r', driver=None) else: - dataset = h5py.File(file_to_load, mode='r', driver=core) + dataset = h5py.File(file_to_load, mode='r', driver='core') data = dataset['/']["data"] - target = dataset['/']["target"] + target = dataset['/']["targets"] return Dataset(metadata, data, target) From 8010e60421b6c6e27ef1d5e9ab7c914871c2ede8 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 18:07:45 -0400 Subject: [PATCH 43/65] Added a method to remove a dataset. --- mldata/dataset_store.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index c146990..70e28b2 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -226,3 +226,14 @@ def CSV_importer(filepath, dset.meta_data.hash = dset.__hash__() return dset + +def remove(name): + """ Remove a dataset from the datasets folder. + + Parameters + ---------- + name : str + Name of the dataset to delete. + + """ + cfg.remove_dataset(name) From 594b4f52b521fd20b647019518c9a46b8f2ff442 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 18:08:05 -0400 Subject: [PATCH 44/65] Insured datasets were cleaned after tests. --- tests/test_dataset_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 1bf1cc7..824854a 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -13,6 +13,7 @@ def setup_module(): def teardown_module(): os.remove("test.csv") + ds.remove("test_dset") def test_CSV_importer(): dset = ds.CSV_importer("test.csv", From 145ecb6a903f8211a12883bf3e491003d95f2585 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 18:08:35 -0400 Subject: [PATCH 45/65] Corrected hashing of function to account for the difference between h5py and numpy arrays. 
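The hashing fix below boils down to feeding ``hashlib`` a contiguous buffer per row, which works for both in-memory ndarrays and h5py datasets. A sketch of the idea outside the class (the helper name is illustrative; the digest depends on dtype and byte order):

import hashlib
import numpy as np

def short_hash(data, target=None):
    """Row-wise MD5 over array-like data (ndarray or h5py dataset)."""
    hasher = hashlib.md5()
    for row in data:
        hasher.update(np.array(row))   # each row becomes a fresh contiguous ndarray
    if target is not None:
        for row in target:
            hasher.update(np.array(row))
    return hasher.hexdigest()[:8]

print(short_hash(np.arange(12.0).reshape(3, 4)))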
--- mldata/dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 21acd5a..dbcc817 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -1,6 +1,8 @@ """Datasets store the data used for experiments.""" import hashlib +import numpy as np + BUFFER_SIZE = 1000 class Dataset(): @@ -39,10 +41,10 @@ def __hash__(self): """ Hash function used for versioning.""" hasher = hashlib.md5() for l in self.data: - hasher.update(l.copy()) + hasher.update(np.array(l)) if self.target is not None: for l in self.target: - hasher.update(l.copy()) + hasher.update(np.array(l)) return hasher.hexdigest()[:8] def __iter__(self): From aa79e28e57edb693a2344c33ad715a305ea34501 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 18:10:22 -0400 Subject: [PATCH 46/65] Module docstring --- mldata/utils/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index b34a9ca..193e85a 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -1,3 +1,4 @@ +""" Manages the configuration file for MLData.""" import configparser import os from os.path import expanduser, join From bb7086bc51de38d6e31448a6849e29f2947de0ac Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 15 Apr 2014 18:16:12 -0400 Subject: [PATCH 47/65] edited todos comments --- mldata/dataset.py | 8 ++------ mldata/dataset_store.py | 6 ++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index dbcc817..cd5bd46 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -48,12 +48,8 @@ def __hash__(self): return hasher.hexdigest()[:8] def __iter__(self): - """Provide an iterator when the Dataset has a target. - - ..todo: retest efficiency of this buffering in python3. With zip - being now lazy, it might not be better than the vanilla iter. - - """ + """Provide an iterator when the Dataset has a target.""" + #todo: retest efficiency of this buffering in python3. With zip being now lazy, it might not be better than the vanilla iter. buffer = min(BUFFER_SIZE, len(self.data)) if self.target is not None: for idx in range(0, len(self.data), buffer): diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 70e28b2..19a98ce 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -1,4 +1,5 @@ """ Manages dataset read/write operations.""" +#todo: Remove precise versions of datasets and manage dependencies. import os import pickle as pk @@ -136,11 +137,8 @@ def _save_metadata(metadata, path, filename): path : str filename : str - .. todo:: A dataset could be orphaned if overwritten by another metadata - file. This needs to be checked in a future version. - """ - + #todo: A dataset could be orphaned if overwritten by another metadata file. This needs to be checked in a future version. if filename not in os.listdir(path): with open(os.path.join(path, filename), 'wb') as f: pk.dump(metadata, f, pk.HIGHEST_PROTOCOL) From a3010fa95a48616aca9a76beddee811872c72018 Mon Sep 17 00:00:00 2001 From: ASalvail Date: Tue, 15 Apr 2014 20:59:09 -0400 Subject: [PATCH 48/65] Rollback to python 3.3 for CI. 
--- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7a23e78..59ccca6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: python python: - - "3.4" + - "3.3" - "2.7" - "2.6" # - "pypy" From e39be557116a1d57d27baa07328dbbfc41751004 Mon Sep 17 00:00:00 2001 From: ASalvail Date: Tue, 15 Apr 2014 21:00:36 -0400 Subject: [PATCH 49/65] Added a check for config file existence --- tests/utils/test_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index ae1f266..480c0d9 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -7,7 +7,8 @@ def setup_module(): # save current config file - os.rename(cfg.CONFIGFILE, cfg.CONFIGFILE +".bak") + if os.path.isfile(cfg.CONFIGFILE): + os.rename(cfg.CONFIGFILE, cfg.CONFIGFILE +".bak") def teardown_module(): # restore config file From ff6c658488e1d801aaceb4af2d686282c37e1f77 Mon Sep 17 00:00:00 2001 From: ASalvail Date: Tue, 15 Apr 2014 21:01:04 -0400 Subject: [PATCH 50/65] Rename test_Dataset.py to test_dataset.py --- tests/{test_Dataset.py => test_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_Dataset.py => test_dataset.py} (100%) diff --git a/tests/test_Dataset.py b/tests/test_dataset.py similarity index 100% rename from tests/test_Dataset.py rename to tests/test_dataset.py From 2db5970f48d59a860c79efdc4ff8ea0d0f6e858c Mon Sep 17 00:00:00 2001 From: ASalvail Date: Tue, 15 Apr 2014 21:02:01 -0400 Subject: [PATCH 51/65] Corrected typo in config.py --- mldata/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mldata/utils/config.py b/mldata/utils/config.py index 193e85a..c897e7c 100644 --- a/mldata/utils/config.py +++ b/mldata/utils/config.py @@ -76,7 +76,7 @@ def _load_config(): def _create_default_config(): """ Build and save a default config file for MLData. - The default config is saved as ``.MLDataConfig`` in the ``$HOME`` folder + The default config is saved as ``.mldataConfig`` in the ``$HOME`` folder or its equivalent. It stores the emplacement of dataset files and make an index of accessible datasets. From 6af517a39b316e1bb9a8993dd144ab5062fe89bb Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 11:03:19 -0400 Subject: [PATCH 52/65] Added a LookupError to handle missing datasets. --- mldata/dataset_store.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 19a98ce..d9b1e62 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -35,10 +35,18 @@ def load(dset_name, version_name="baseDataset", lazy=False): Dataset Return the loaded dataset, if it exists. Else, return ``None``. + Raises + ------ + LookupError + If the dataset ``dset_name`` does not exist, a ``LookupError`` is + raised. + """ path = None if cfg.dataset_exists(dset_name): path = cfg.get_dataset_path(dset_name) + else: + raise LookupError("This dataset does not exist.") return _load_from_file(dset_name + '_' + version_name, path, lazy) def save(dataset, version_name="baseDataset"): From 9e0b80ca9b0c80e39382b2bbd68e186dfc3a93b9 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 11:06:09 -0400 Subject: [PATCH 53/65] Added a LookupError to handle missing datasets_versions. 
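Together with the missing-dataset check added just above, the change below lets callers treat an absent dataset/version pair as an ordinary ``LookupError``. A hedged usage sketch (assumes the package is importable as ``mldata``; the dataset and version names are illustrative):

import mldata.dataset_store as ds

try:
    dset = ds.load("test_dset", "v1")
except LookupError as err:
    # Raised when the dataset or the requested version does not exist.
    print("Cannot load dataset:", err)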
--- mldata/dataset_store.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index d9b1e62..5734330 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -101,8 +101,11 @@ def _load_from_file(name, path, lazy): """ metadata = None - with open(os.path.join(path, name) + '.meta', 'rb') as f: - metadata = pk.load(f) + try: + with open(os.path.join(path, name) + '.meta', 'rb') as f: + metadata = pk.load(f) + except FileNotFoundError: + raise LookupError("This dataset/version pair does not exist : " + name) dataset = None file_to_load = os.path.join(path, metadata.hash + ".data") From 2411f40eb1264c2b0153271d2df758c3224dfc84 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 11:11:23 -0400 Subject: [PATCH 54/65] Added test to make sure missing datasets are handled properly. --- tests/test_dataset_store.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 824854a..fb273ea 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -50,3 +50,12 @@ def test_save_load(): nt.assert_equal(dset.meta_data.dictionary, dset3.meta_data.dictionary) nt.assert_equal(dset.meta_data.nb_examples, dset3.meta_data.nb_examples) nt.assert_equal(dset.meta_data.splits, dset3.meta_data.splits) + + # handle missing datasets + with nt.assert_raises(LookupError): + ds.load("inexistant_dataset") + + with nt.assert_raises(LookupError): + ds.load("test_dset", "v3") + + From e62380320331442d0540036443f7e35bb7c869a7 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 11:36:11 -0400 Subject: [PATCH 55/65] Changed ``splits`` logic. We can now either gave the ``splits`` be given in the form (nb_train, ..., nb_test) or (nb_train, ..., nb_train + ... + nb_valid) --- mldata/dataset.py | 10 ++++++++++ mldata/dataset_store.py | 9 ++++++--- tests/test_dataset.py | 8 ++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index cd5bd46..0418fc0 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -29,6 +29,13 @@ class Dataset(): """ def __init__(self, meta_data, data, target=None): + assert(len(data) == meta_data.nb_examples, + "The metadata ``nb_examples`` is inconsistent with the length of " + "the dataset.") + assert(len(data) == meta_data.splits[-1] or + len(data) == sum(meta_data.splits), + "The metadata ``splits`` is inconsistent with the length of the " + "dataset.") self.data = data self.target = target assert isinstance(meta_data, Metadata) @@ -152,11 +159,14 @@ class Metadata(): nb_examples : int The number of example in the dataset (including all splits). Default: 0. dictionary : Dictionary + _Not yet implemented_ Gives a mapping of words (str) to id (int). Used only when the dataset has been saved as an array of numbers instead of text. Default: None splits : tuple of int Specifies the split used by this view of the dataset. Default: (). + The numbers can be either the number of the last examples in each + subsets or the number of examples in each categories. preprocess : function or None A function that is callable on a `Dataset` to preprocess the data. The function cannot be a lambda function since those can't be pickled. 
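Both accepted ``splits`` forms describe the same partition: the cumulative boundaries can be recovered from per-subset counts with ``itertools.accumulate``, which is also how the split-iterator patch further below normalizes them. A small illustrative check:

from itertools import accumulate

counts = (70, 20, 10)                   # per-subset sizes
boundaries = tuple(accumulate(counts))  # cumulative form, as used in the tests
assert boundaries == (70, 90, 100)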
diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 5734330..4502da7 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -182,6 +182,8 @@ def CSV_importer(filepath, For example, if there is 8000 examples with 5000 in the training set, 2000 in the validation set and 1000 in the test set, the splits would be ``(5000, 7000, 8000)``. + An alternative form where each numbers represent the count of each + subsets is also supported. target_column : int, optional The column number of the target. If no target is provided, set to ``None``. Default: None. @@ -219,9 +221,10 @@ def CSV_importer(filepath, meta = Metadata() meta.name = name meta.splits = splits - assert len(data) == splits[-1], "The dataset read is not consistent with " \ - "the split given." - meta.nb_examples = splits[-1] + assert(len(data) == splits[-1] or + len(data) == sum(splits), + "The dataset read is not consistent with the split given.") + meta.nb_examples = len(data) dset = None if target_column is not None: diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d271b1a..84b5237 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -16,6 +16,9 @@ def setup_class(self): self.metadataS = Metadata() self.metadataS.splits = (10, 20, 30) self.metadataS.nb_examples = 30 + self.metadataL = Metadata() + self.metadataL.splits = (1000, 2000, 3000) + self.metadataL.nb_examples = 3000 self.dsetS = Dataset(self.metadataS, self.dataSmall, self.targetSmall) def test_Dataset(self): @@ -36,13 +39,14 @@ def test_hash(self): dset2 = Dataset(self.metadataS, self.dataSmall) nt.assert_not_equal(dset2.__hash__(), dset.__hash__()) - dset3 = Dataset(self.metadataS, self.dataLarge) + dset3 = Dataset(self.metadataL, self.dataLarge) nt.assert_not_equal(dset2.__hash__(), dset3.__hash__()) nt.assert_not_equal(dset3.__hash__(), dset.__hash__()) meta = Metadata() meta.name = "AnotherName" - meta.splits = (10, 20, 30) + meta.splits = (10, 10, 10) # alternative split form + meta.nb_examples = 30 dset4 = Dataset(meta, self.dataSmall) nt.assert_equal(dset4.__hash__(), dset2.__hash__()) nt.assert_not_equal(dset4.__hash__(), dset3.__hash__()) From 29c723ecdb67b86574e1be3a615ceca9da398442 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 12:38:58 -0400 Subject: [PATCH 56/65] Added a split iterator. Creates a tuple, each containing a generator over a part of the dataset, following the given splits. --- mldata/dataset.py | 78 ++++++++++++++++++++++++++++--------------- tests/test_dataset.py | 19 ++++++++--- 2 files changed, 65 insertions(+), 32 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 0418fc0..a81a72d 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -1,4 +1,5 @@ """Datasets store the data used for experiments.""" +from itertools import accumulate import hashlib import numpy as np @@ -55,17 +56,19 @@ def __hash__(self): return hasher.hexdigest()[:8] def __iter__(self): - """Provide an iterator when the Dataset has a target.""" + """Provide an iterator handling if the Dataset has a target.""" #todo: retest efficiency of this buffering in python3. With zip being now lazy, it might not be better than the vanilla iter. 
- buffer = min(BUFFER_SIZE, len(self.data)) + buffer = min(BUFFER_SIZE, len(self)) if self.target is not None: for idx in range(0, len(self.data), buffer): - for ex, tg in zip(self.data[idx:idx+buffer], - self.target[idx:idx+buffer]): + stop = min(idx + buffer, len(self)) + for ex, tg in zip(self.data[idx:stop], + self.target[idx:stop]): yield (ex,tg) else: for idx in range(0, len(self.data), buffer): - for ex in self.data[idx:idx+buffer]: + stop = min(idx + buffer, len(self)) + for ex in self.data[idx:stop]: yield (ex,) def __getitem__(self, key): @@ -91,36 +94,57 @@ def __getitem__(self, key): else: return (self.data[key],) - def get_splits(self): - """Return the splits defined by the associated metadata. + def _split_iterators(self, start, end): + """ Iterate on a split. - The split is given via a tuple of integer with each integers - representing the integer after the last id used by this split. For - example:: + Parameters + ---------- + start : int + Id of the first element of the split. + end : int + Id of the next element after the last. + + """ + buffer = min(BUFFER_SIZE, end - start) + if self.target is not None: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for ex, tg in zip(self.data[idx:stop], + self.target[idx:stop]): + yield (ex,tg) + else: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for ex in self.data[idx:stop]: + yield (ex,) - (5000, 6000, 7000) + def get_splits_iterators(self): + """ Creates a tuple of iterator, each iterating on a split. - would give a test set of all examples from 0 to 4999, a validation - set of examples 5000 to 5999 and a test set of examples 6000 up to - 6999. This means that 7000 is also the number of examples in the - dataset. + Each iterators returned is used to iterate over the corresponding + split. For example, if the ``Metadata`` specifies a ``splits`` of + (10, 20, 30), ``get_splits_iterators`` returns a 3-tuple with an + iterator for the ten first examples, another for the ten next and a + third for the ten lasts. Returns ------- - tuple of int - Where each integer gives the id of the example coming after the - last one in a split. - - Notes - ----- - For now, only a tuple is accepted. Eventually, predicates over the - examples id could be supported. + tuple of iterable + A tuple of iterator, one for each split. """ - if isinstance(self.meta_data.splits, tuple): - return self.meta_data.splits - else: - raise NotImplementedError("Only splits with tuple are supported.") + sp = list(self.meta_data.splits) + + # normalize the splits< + if sum(sp) == len(self): + sp = list(accumulate(sp)) + assert(sp[-1] == len(self), + "The splits couldn't be normalized") + + itors = [] + for start, end in zip([0] + sp, sp): + itors.append(self._split_iterators(start, end)) + return itors def apply(self): """Apply the preprocess specified in the associated metadata. 
diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 84b5237..30ddcb9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,6 +1,7 @@ import numpy as np import nose.tools as nt import copy +from itertools import chain, accumulate from mldata.dataset import Dataset, Metadata @@ -17,9 +18,10 @@ def setup_class(self): self.metadataS.splits = (10, 20, 30) self.metadataS.nb_examples = 30 self.metadataL = Metadata() - self.metadataL.splits = (1000, 2000, 3000) + self.metadataL.splits = (1000, 1000, 1000) self.metadataL.nb_examples = 3000 self.dsetS = Dataset(self.metadataS, self.dataSmall, self.targetSmall) + self.dsetL = Dataset(self.metadataL, self.dataLarge, self.targetLarge) def test_Dataset(self): dset = Dataset(self.metadataS, self.dataSmall) @@ -51,13 +53,9 @@ def test_hash(self): nt.assert_equal(dset4.__hash__(), dset2.__hash__()) nt.assert_not_equal(dset4.__hash__(), dset3.__hash__()) - def test_get_splits(self): - nt.assert_equal(self.dsetS.get_splits(), (10, 20, 30)) - def test_len(self): nt.assert_equal(len(self.dsetS), len(self.dsetS.data)) nt.assert_equal(len(self.dsetS), self.dsetS.meta_data.nb_examples) - nt.assert_equal(len(self.dsetS), self.dsetS.get_splits()[-1]) def test_preprocess(self): data2 = self.dataSmall * 2 @@ -80,6 +78,17 @@ def test_get(self): for i in range(len(self.dataSmall)): nt.assert_true(np.array_equal(self.dataSmall[i], self.dsetS[i][0])) + def test_get_splits_iterators(self): + citer = chain.from_iterable(self.dsetS.get_splits_iterators()) + for a, b in zip(citer, self.dsetS): + d1 = a[0] + d2 = b[0] + nt.assert_true(np.array_equal(d1,d2)) + + sp = self.dsetL.meta_data.splits + for splitn, it in zip(sp, self.dsetL.get_splits_iterators()): + nt.assert_equal(sum(1 for _ in it), splitn) + def double_dset(dset): """ Basic preprocessing function. """ return Dataset(dset.meta_data, dset.data * 2, dset.target * 2) From 503be663d87df5700fd677b676b68b142c0b6bbc Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Wed, 16 Apr 2014 14:21:37 -0400 Subject: [PATCH 57/65] Added a lazy read test. --- tests/test_dataset_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index fb273ea..0ca1c5f 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -42,7 +42,7 @@ def test_save_load(): dset2.data = ndata * 2 ds.save(dset2, version_name="v2") - dset3 = ds.load("test_dset", "v2") + dset3 = ds.load("test_dset", "v2",lazy=True) nt.assert_not_equal(dset3.__hash__(), dset.__hash__()) nt.assert_equal(dset3.meta_data.hash, dset3.__hash__()) From 8e99c2101cd014e1dbdaebe7c2ba0820bbd988a8 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Fri, 25 Apr 2014 13:45:08 -0400 Subject: [PATCH 58/65] Corrected an assert statement --- mldata/dataset_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 4502da7..3919146 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -221,9 +221,9 @@ def CSV_importer(filepath, meta = Metadata() meta.name = name meta.splits = splits - assert(len(data) == splits[-1] or - len(data) == sum(splits), - "The dataset read is not consistent with the split given.") + assert len(data) == splits[-1] or \ + len(data) == sum(splits),\ + "The dataset read is not consistent with the split given." 
meta.nb_examples = len(data) dset = None From a59492dc8bf317966989ed094e40a80d6bd4ec2a Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Fri, 25 Apr 2014 14:18:27 -0400 Subject: [PATCH 59/65] Close h5py File handle and correct noTarget dset --- mldata/dataset_store.py | 27 +++++++++++++++++++++------ tests/test_dataset_store.py | 7 +++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 3919146..cf22858 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -107,17 +107,23 @@ def _load_from_file(name, path, lazy): except FileNotFoundError: raise LookupError("This dataset/version pair does not exist : " + name) - dataset = None + datasetFile = None file_to_load = os.path.join(path, metadata.hash + ".data") if lazy: - dataset = h5py.File(file_to_load, mode='r', driver=None) + datasetFile = h5py.File(file_to_load, mode='r', driver=None) else: - dataset = h5py.File(file_to_load, mode='r', driver='core') + datasetFile = h5py.File(file_to_load, mode='r', driver='core') - data = dataset['/']["data"] - target = dataset['/']["targets"] + data = datasetFile['/']["data"] + target = None + try: + target = datasetFile['/']["targets"] + except: + pass - return Dataset(metadata, data, target) + dset = Dataset(metadata, data, target) + dset._fileHandle = h5pyFileWrapper(datasetFile) + return dset def _save_dataset(dataset, path, filename): """Call to ``h5py`` to write the dataset @@ -249,3 +255,12 @@ def remove(name): """ cfg.remove_dataset(name) + +class h5pyFileWrapper: + """ Used to close handle when a ``Dataset`` is destroyed.""" + + def __init__(self, file): + self.file = file + + def __del__(self): + self.file.close() diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index 0ca1c5f..b2a7aef 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -28,8 +28,13 @@ def test_save_load(): "test_dset", (70, 90, 100), 0) + dset_nt = ds.CSV_importer("test.csv", + "test_dset", + (70,90,100)) ds.save(dset, "v1") + ds.save(dset_nt, "noTarget") dset2 = ds.load("test_dset", "v1") + dset_nt2 = ds.load("test_dset", "noTarget") nt.assert_equal(dset.__hash__(), dset2.__hash__()) nt.assert_equal(dset.meta_data.name, dset2.meta_data.name) @@ -58,4 +63,6 @@ def test_save_load(): with nt.assert_raises(LookupError): ds.load("test_dset", "v3") + nt.assert_is_none(dset_nt2.target) + From 0f35b10e0920c387125380992145efa962c0b5c0 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Tue, 13 May 2014 13:43:22 -0400 Subject: [PATCH 60/65] Added support for minibatches --- mldata/dataset.py | 24 +++++++++++++++--------- tests/test_dataset.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index a81a72d..4d57515 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -94,7 +94,7 @@ def __getitem__(self, key): else: return (self.data[key],) - def _split_iterators(self, start, end): + def _split_iterators(self, start, end, minibatch_size=1): """ Iterate on a split. 
Parameters @@ -109,16 +109,17 @@ def _split_iterators(self, start, end): if self.target is not None: for idx in range(start, end, buffer): stop = min(idx+buffer, end) - for ex, tg in zip(self.data[idx:stop], - self.target[idx:stop]): - yield (ex,tg) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j], self.target[i:j]) else: for idx in range(start, end, buffer): stop = min(idx+buffer, end) - for ex in self.data[idx:stop]: - yield (ex,) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j],) - def get_splits_iterators(self): + def get_splits_iterators(self, minibatch_size=1): """ Creates a tuple of iterator, each iterating on a split. Each iterators returned is used to iterate over the corresponding @@ -127,6 +128,11 @@ def get_splits_iterators(self): iterator for the ten first examples, another for the ten next and a third for the ten lasts. + Parameters + ---------- + minibatch_size : int + The size of minibatches received each iteration. + Returns ------- tuple of iterable @@ -135,7 +141,7 @@ def get_splits_iterators(self): """ sp = list(self.meta_data.splits) - # normalize the splits< + # normalize the splits if sum(sp) == len(self): sp = list(accumulate(sp)) assert(sp[-1] == len(self), @@ -143,7 +149,7 @@ def get_splits_iterators(self): itors = [] for start, end in zip([0] + sp, sp): - itors.append(self._split_iterators(start, end)) + itors.append(self._split_iterators(start, end, minibatch_size)) return itors def apply(self): diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 30ddcb9..954fb61 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -82,7 +82,7 @@ def test_get_splits_iterators(self): citer = chain.from_iterable(self.dsetS.get_splits_iterators()) for a, b in zip(citer, self.dsetS): d1 = a[0] - d2 = b[0] + d2 = [b[0]] nt.assert_true(np.array_equal(d1,d2)) sp = self.dsetL.meta_data.splits From 5425285adab3999f36b9a11e6656045a2695a7d5 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Sat, 31 May 2014 19:00:29 -0400 Subject: [PATCH 61/65] Changed iterators to cycle infinitely and corrected an assert statement. --- mldata/dataset.py | 71 +++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 4d57515..8cdf91c 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -6,6 +6,7 @@ BUFFER_SIZE = 1000 + class Dataset(): """Interface to interact with physical dataset @@ -30,16 +31,15 @@ class Dataset(): """ def __init__(self, meta_data, data, target=None): - assert(len(data) == meta_data.nb_examples, - "The metadata ``nb_examples`` is inconsistent with the length of " - "the dataset.") - assert(len(data) == meta_data.splits[-1] or - len(data) == sum(meta_data.splits), - "The metadata ``splits`` is inconsistent with the length of the " - "dataset.") + assert len(data) == meta_data.nb_examples,\ + "The metadata ``nb_examples`` is inconsistent with the length of "\ + "the dataset." + assert len(data) == meta_data.splits[-1] or\ + len(data) == sum(meta_data.splits),\ + "The metadata ``splits`` is inconsistent with the length of "\ + "the dataset." self.data = data self.target = target - assert isinstance(meta_data, Metadata) self.meta_data = meta_data def __len__(self): @@ -59,17 +59,20 @@ def __iter__(self): """Provide an iterator handling if the Dataset has a target.""" #todo: retest efficiency of this buffering in python3. 
With zip being now lazy, it might not be better than the vanilla iter. buffer = min(BUFFER_SIZE, len(self)) - if self.target is not None: - for idx in range(0, len(self.data), buffer): - stop = min(idx + buffer, len(self)) - for ex, tg in zip(self.data[idx:stop], - self.target[idx:stop]): - yield (ex,tg) - else: - for idx in range(0, len(self.data), buffer): - stop = min(idx + buffer, len(self)) - for ex in self.data[idx:stop]: - yield (ex,) + + # Cycle infinitely + while True: + if self.target is not None: + for idx in range(0, len(self.data), buffer): + stop = min(idx + buffer, len(self)) + for ex, tg in zip(self.data[idx:stop], + self.target[idx:stop]): + yield (ex, tg) + else: + for idx in range(0, len(self.data), buffer): + stop = min(idx + buffer, len(self)) + for ex in self.data[idx:stop]: + yield (ex,) def __getitem__(self, key): """Get the entry specified by the key. @@ -106,18 +109,21 @@ def _split_iterators(self, start, end, minibatch_size=1): """ buffer = min(BUFFER_SIZE, end - start) - if self.target is not None: - for idx in range(start, end, buffer): - stop = min(idx+buffer, end) - for i in range(idx, stop, minibatch_size): - j = min(stop, i+minibatch_size) - yield (self.data[i:j], self.target[i:j]) - else: - for idx in range(start, end, buffer): - stop = min(idx+buffer, end) - for i in range(idx, stop, minibatch_size): - j = min(stop, i+minibatch_size) - yield (self.data[i:j],) + + # Cycle infinitely + while True: + if self.target is not None: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j], self.target[i:j].reshape((1, -1))) + else: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j],) def get_splits_iterators(self, minibatch_size=1): """ Creates a tuple of iterator, each iterating on a split. @@ -144,8 +150,7 @@ def get_splits_iterators(self, minibatch_size=1): # normalize the splits if sum(sp) == len(self): sp = list(accumulate(sp)) - assert(sp[-1] == len(self), - "The splits couldn't be normalized") + assert sp[-1] == len(self), "The splits couldn't be normalized" itors = [] for start, end in zip([0] + sp, sp): From 6a16d4316bfca5feb3d12e8c4f3a41c2aeb9d879 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Sat, 31 May 2014 19:10:44 -0400 Subject: [PATCH 62/65] Removed infinite cycle in iterator. Promote the use of itertools.cycle(iter) instead. --- mldata/dataset.py | 54 ++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index 8cdf91c..a873728 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -60,19 +60,17 @@ def __iter__(self): #todo: retest efficiency of this buffering in python3. With zip being now lazy, it might not be better than the vanilla iter. 
buffer = min(BUFFER_SIZE, len(self)) - # Cycle infinitely - while True: - if self.target is not None: - for idx in range(0, len(self.data), buffer): - stop = min(idx + buffer, len(self)) - for ex, tg in zip(self.data[idx:stop], - self.target[idx:stop]): - yield (ex, tg) - else: - for idx in range(0, len(self.data), buffer): - stop = min(idx + buffer, len(self)) - for ex in self.data[idx:stop]: - yield (ex,) + if self.target is not None: + for idx in range(0, len(self.data), buffer): + stop = min(idx + buffer, len(self)) + for ex, tg in zip(self.data[idx:stop], + self.target[idx:stop]): + yield (ex, tg) + else: + for idx in range(0, len(self.data), buffer): + stop = min(idx + buffer, len(self)) + for ex in self.data[idx:stop]: + yield (ex,) def __getitem__(self, key): """Get the entry specified by the key. @@ -93,9 +91,9 @@ def __getitem__(self, key): """ if self.target is not None: - return (self.data[key], self.target[key]) + return self.data[key], self.target[key] else: - return (self.data[key],) + return self.data[key], def _split_iterators(self, start, end, minibatch_size=1): """ Iterate on a split. @@ -110,20 +108,18 @@ def _split_iterators(self, start, end, minibatch_size=1): """ buffer = min(BUFFER_SIZE, end - start) - # Cycle infinitely - while True: - if self.target is not None: - for idx in range(start, end, buffer): - stop = min(idx+buffer, end) - for i in range(idx, stop, minibatch_size): - j = min(stop, i+minibatch_size) - yield (self.data[i:j], self.target[i:j].reshape((1, -1))) - else: - for idx in range(start, end, buffer): - stop = min(idx+buffer, end) - for i in range(idx, stop, minibatch_size): - j = min(stop, i+minibatch_size) - yield (self.data[i:j],) + if self.target is not None: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j], self.target[i:j].reshape((1, -1))) + else: + for idx in range(start, end, buffer): + stop = min(idx+buffer, end) + for i in range(idx, stop, minibatch_size): + j = min(stop, i+minibatch_size) + yield (self.data[i:j],) def get_splits_iterators(self, minibatch_size=1): """ Creates a tuple of iterator, each iterating on a split. From ed98c62131d5a21318736aa8cf76166b531838cf Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Sat, 31 May 2014 19:11:34 -0400 Subject: [PATCH 63/65] Changed import names to reflect name change. Some other minor corrections. --- mldata/dataset_store.py | 17 +++++++++++------ tests/test_dataset.py | 16 +++++++++------- tests/test_dataset_store.py | 11 +++++++---- tests/utils/test_config.py | 11 +++++++---- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index cf22858..1b96320 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -6,8 +6,8 @@ import h5py import numpy as np -from mldata.dataset import Dataset, Metadata -import mldata.utils.config as cfg +from SMARTdata.mldata.utils import config as cfg +from SMARTdata.mldata.dataset import Dataset, Metadata def load(dset_name, version_name="baseDataset", lazy=False): @@ -49,6 +49,7 @@ def load(dset_name, version_name="baseDataset", lazy=False): raise LookupError("This dataset does not exist.") return _load_from_file(dset_name + '_' + version_name, path, lazy) + def save(dataset, version_name="baseDataset"): """ Save the dataset, manages versions. 
@@ -96,6 +97,7 @@ def save(dataset, version_name="baseDataset"): meta_file = dset_name + '_' + version_name + ".meta" _save_metadata(dataset.meta_data, dset_path, meta_file) + def _load_from_file(name, path, lazy): """ Call to ``h5py`` to load the file. @@ -114,17 +116,17 @@ def _load_from_file(name, path, lazy): else: datasetFile = h5py.File(file_to_load, mode='r', driver='core') - data = datasetFile['/']["data"] + data = datasetFile['/']["data"] target = None try: target = datasetFile['/']["targets"] except: pass - dset = Dataset(metadata, data, target) dset._fileHandle = h5pyFileWrapper(datasetFile) return dset + def _save_dataset(dataset, path, filename): """Call to ``h5py`` to write the dataset @@ -145,6 +147,7 @@ def _save_dataset(dataset, path, filename): if dataset.target is not None: f.create_dataset('targets', data=dataset.target) + def _save_metadata(metadata, path, filename): """ Pickle the metadata. @@ -160,6 +163,7 @@ def _save_metadata(metadata, path, filename): with open(os.path.join(path, filename), 'wb') as f: pk.dump(metadata, f, pk.HIGHEST_PROTOCOL) + def CSV_importer(filepath, name, splits, @@ -234,7 +238,7 @@ def CSV_importer(filepath, dset = None if target_column is not None: - targets = data[:, target_column] + targets = data[:, target_column].reshape((1, -1)) examples = data[:, list(range(0,target_column)) + list(range(target_column+1, data.shape[1]))] dset = Dataset(meta, examples, targets) @@ -245,6 +249,7 @@ def CSV_importer(filepath, return dset + def remove(name): """ Remove a dataset from the datasets folder. @@ -256,9 +261,9 @@ def remove(name): """ cfg.remove_dataset(name) + class h5pyFileWrapper: """ Used to close handle when a ``Dataset`` is destroyed.""" - def __init__(self, file): self.file = file diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 954fb61..b4c3a7d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,18 +1,19 @@ +import copy +from itertools import chain + import numpy as np import nose.tools as nt -import copy -from itertools import chain, accumulate -from mldata.dataset import Dataset, Metadata +from SMARTdata.mldata.dataset import Dataset, Metadata class Dataset_test: @classmethod def setup_class(self): - self.dataSmall = np.random.rand(30, 5) - self.dataLarge = np.random.rand(3000, 5) - self.targetSmall = np.random.rand(30, 1) - self.targetLarge = np.random.rand(3000, 1) + self.dataSmall = np.random.random((30, 5)) + self.dataLarge = np.random.random((3000, 5)) + self.targetSmall = np.random.random((30, 1)) + self.targetLarge = np.random.random((3000, 1)) self.metadataS = Metadata() self.metadataS.splits = (10, 20, 30) @@ -89,6 +90,7 @@ def test_get_splits_iterators(self): for splitn, it in zip(sp, self.dsetL.get_splits_iterators()): nt.assert_equal(sum(1 for _ in it), splitn) + def double_dset(dset): """ Basic preprocessing function. 
""" return Dataset(dset.meta_data, dset.data * 2, dset.target * 2) diff --git a/tests/test_dataset_store.py b/tests/test_dataset_store.py index b2a7aef..6fec26f 100644 --- a/tests/test_dataset_store.py +++ b/tests/test_dataset_store.py @@ -3,25 +3,28 @@ import numpy as np import nose.tools as nt -import mldata.dataset_store as ds +import SMARTdata.mldata.dataset_store as ds -RND_MATRIX = np.random.rand(100,10) +RND_MATRIX = np.random.random((100, 10)) def setup_module(): np.savetxt("test.csv", RND_MATRIX) + def teardown_module(): os.remove("test.csv") ds.remove("test_dset") + def test_CSV_importer(): dset = ds.CSV_importer("test.csv", "test_dset", (70, 90, 100), 0) - nt.assert_true(np.array_equal(RND_MATRIX[:,1:], dset.data)) + nt.assert_true(np.array_equal(RND_MATRIX[:, 1:], dset.data)) + def test_save_load(): dset = ds.CSV_importer("test.csv", @@ -30,7 +33,7 @@ def test_save_load(): 0) dset_nt = ds.CSV_importer("test.csv", "test_dset", - (70,90,100)) + (70, 90, 100)) ds.save(dset, "v1") ds.save(dset_nt, "noTarget") dset2 = ds.load("test_dset", "v1") diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 480c0d9..f8fffa2 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -1,18 +1,20 @@ import os -import configparser as cp import nose.tools as nt -import mldata.utils.config as cfg +from SMARTdata.mldata.utils import config as cfg + def setup_module(): # save current config file if os.path.isfile(cfg.CONFIGFILE): - os.rename(cfg.CONFIGFILE, cfg.CONFIGFILE +".bak") + os.rename(cfg.CONFIGFILE, cfg.CONFIGFILE + ".bak") + def teardown_module(): # restore config file - os.rename(cfg.CONFIGFILE +".bak", cfg.CONFIGFILE) + os.rename(cfg.CONFIGFILE + ".bak", cfg.CONFIGFILE) + def test_load_config(): cf = cfg._load_config() @@ -22,6 +24,7 @@ def test_load_config(): nt.assert_equal(path, cfg._load_path()) nt.assert_true(cf.has_section('datasets')) + def test_add_remove(): cfg.add_dataset("test_dataset") nt.assert_true(cfg.dataset_exists("test_dataset")) From 6ccca017607d819d7fdceead123b358d95236e52 Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 7 Jul 2014 11:17:21 -0400 Subject: [PATCH 64/65] Added a convenient method the whole dataset in memory. --- mldata/dataset.py | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/mldata/dataset.py b/mldata/dataset.py index a873728..42cfef2 100644 --- a/mldata/dataset.py +++ b/mldata/dataset.py @@ -141,18 +141,34 @@ def get_splits_iterators(self, minibatch_size=1): A tuple of iterator, one for each split. """ - sp = list(self.meta_data.splits) + sp = self._normalize_splits() - # normalize the splits - if sum(sp) == len(self): - sp = list(accumulate(sp)) - assert sp[-1] == len(self), "The splits couldn't be normalized" - - itors = [] - for start, end in zip([0] + sp, sp): - itors.append(self._split_iterators(start, end, minibatch_size)) + itors = [self._split_iterators(start, end, minibatch_size) for + (start, end) in zip([0] + sp, sp)] return itors + def get_splits(self): + """ Get the datasets arrays. + + WARNING : This method will try to load the entire dataset in memory. + + Returns + ------- + tuple of tuple of array + The data and targets sliced in multiple subarrays. 
+ ``((data1, target1), (data2, target2), ...)`` + + """ + sp = self._normalize_splits() + indices = zip([0]+sp, sp) + + if self.target is not None: + return tuple((self.data[slice(*s)], self.target[slice(*s)]) + for s in indices) + else: + return tuple((self.data[slice(*s)],) for s in indices) + + def apply(self): """Apply the preprocess specified in the associated metadata. @@ -171,6 +187,16 @@ def apply(self): assert isinstance(ds, Dataset) return ds + def _normalize_splits(self): + sp = list(self.meta_data.splits) + + # normalize the splits + if sum(sp) == len(self): + sp = list(accumulate(sp)) + assert sp[-1] == len(self), "The splits couldn't be normalized" + + return sp + class Metadata(): """Keep track of information about a dataset. @@ -214,9 +240,11 @@ def __init__(self): self.preprocess = default_preprocess self.hash = "" + def default_preprocess(dset): return dset + class Dictionary: """Word / integer association list From ada95215c52a8f915c1504d29cc1aed4e47896ac Mon Sep 17 00:00:00 2001 From: Adam Salvail Date: Mon, 7 Jul 2014 11:19:22 -0400 Subject: [PATCH 65/65] Targets reshaped to fit adequate data structure. --- mldata/dataset_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mldata/dataset_store.py b/mldata/dataset_store.py index 1b96320..e6abfa5 100644 --- a/mldata/dataset_store.py +++ b/mldata/dataset_store.py @@ -238,8 +238,8 @@ def CSV_importer(filepath, dset = None if target_column is not None: - targets = data[:, target_column].reshape((1, -1)) - examples = data[:, list(range(0,target_column)) + + targets = data[:, target_column].reshape((-1, 1)) + examples = data[:, list(range(0, target_column)) + list(range(target_column+1, data.shape[1]))] dset = Dataset(meta, examples, targets) else: