Bug fixed
daviddiazvico committed Sep 9, 2019
1 parent 9466137 commit ec5ffb6
Showing 16 changed files with 598 additions and 803 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -16,7 +16,7 @@ script:
- python setup.py test
- mkdir docs
- export PYTHONPATH=`pwd`
- sphinx-quickstart -q -p scikit-datasets -a "David Diaz Vico" -v 0.1 -r 0.1.30 -l en --ext-autodoc --ext-viewcode --ext-githubpages --extensions sphinxcontrib.napoleon --no-makefile --no-batchfile docs
- sphinx-quickstart -q -p scikit-datasets -a "David Diaz Vico" -v 0.1 -r 0.1.31 -l en --ext-autodoc --ext-viewcode --ext-githubpages --extensions sphinxcontrib.napoleon --no-makefile --no-batchfile docs
- sphinx-apidoc -o docs/_static/ skdatasets -F -a -l
- travis-sphinx -v build -s docs -n
after_success:
81 changes: 41 additions & 40 deletions README.md
@@ -1,41 +1,42 @@
# scikit-datasets
Scikit-learn-compatible datasets

## Status
[![Build Status](https://travis-ci.com/daviddiazvico/scikit-datasets.svg?branch=master)](https://travis-ci.com/daviddiazvico/scikit-datasets)
[![Maintainability](https://api.codeclimate.com/v1/badges/a37c9ee152b41a0cb577/maintainability)](https://codeclimate.com/github/daviddiazvico/scikit-datasets/maintainability)
[![Test Coverage](https://api.codeclimate.com/v1/badges/a37c9ee152b41a0cb577/test_coverage)](https://codeclimate.com/github/daviddiazvico/scikit-datasets/test_coverage)
[![Build Status](https://dev.azure.com/daviddiazvico0337/daviddiazvico/_apis/build/status/daviddiazvico.scikit-datasets?branchName=master)](https://dev.azure.com/daviddiazvico0337/daviddiazvico/_build/latest?definitionId=1&branchName=master)

## Installation
Available on [PyPI](https://pypi.python.org/pypi?:action=display&name=scikit-datasets)
```
pip install scikit-datasets
```

## Documentation
Autogenerated and hosted in [GitHub Pages](https://daviddiazvico.github.io/scikit-datasets/)

## Distribution
Run the following command from the project root to create the distribution
```
python setup.py sdist bdist_wheel
```
and upload the package to [testPyPI](https://testpypi.python.org/)
```
twine upload --repository-url https://test.pypi.org/legacy/ dist/*
```
or [PyPI](https://pypi.python.org/)
```
twine upload dist/*
```

## Citation
If you find scikit-datasets useful, please cite it in your publications. You can use this [BibTeX](http://www.bibtex.org/) entry:
```
@misc{scikit-datasets,
title={scikit-datasets},
author={Diaz-Vico, David},
year={2017},
publisher={GitHub},
howpublished={\url{https://github.com/daviddiazvico/scikit-datasets}}}
```
31 changes: 31 additions & 0 deletions azure-pipelines.yml
@@ -0,0 +1,31 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

trigger:
- master

pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'

steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'

- script: |
python -m pip install --upgrade pip
displayName: 'Install dependencies'

- script: |
pip install pytest-azurepipelines
python setup.py test
displayName: 'Test'
20 changes: 9 additions & 11 deletions setup.py
@@ -9,30 +9,28 @@

setup(name='scikit-datasets',
packages=find_packages(),
version='0.1.30',
version='0.1.31',
description='Scikit-learn-compatible datasets',
# long_description=open('README.md', 'r').read(),
author='David Diaz Vico',
author_email='[email protected]',
url='https://github.com/daviddiazvico/scikit-datasets',
download_url='https://github.com/daviddiazvico/scikit-datasets/archive/v0.1.30.tar.gz',
download_url='https://github.com/daviddiazvico/scikit-datasets/archive/v0.1.31.tar.gz',
keywords=['scikit-learn'],
classifiers=['Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6'],
install_requires=['scikit-learn'],
'Programming Language :: Python :: 3.7'],
install_requires=['numpy', 'scipy', 'scikit-learn'],
extras_require={'cran': ['rdata'],
'forex': ['forex_python'],
'keel': ['pandas'],
'keras': ['keras'],
'utils.estimator': ['jsonpickle'],
'utils.experiments': ['sacred'],
'utils.scores': ['pandas', 'scipy', 'statsmodels'],
'utils.validation': ['seaborn']},
'utils.estimator': ['jsonpickle==0.9.6'],
'utils.experiments': ['sacred']},
setup_requires=['pytest-runner'],
tests_require=['coverage', 'forex_python', 'jsonpickle', 'keras',
'pandas', 'pytest', 'pytest-cov', 'rdata', 'sacred',
'scipy', 'seaborn', 'statsmodels', 'tensorflow'],
tests_require=['coverage', 'forex_python', 'jsonpickle==0.9.6', 'keras',
'pandas', 'pymongo', 'pytest', 'pytest-cov', 'rdata',
'sacred', 'tensorflow'],
test_suite='tests')
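As a quick aside (not part of the diff), the `extras_require` entries above map onto ordinary pip extras, so optional repository support stays opt-in; the extra names below are the ones declared in setup.py, the rest is standard pip syntax:
```
# Base install pulls in only the hard dependencies (numpy, scipy, scikit-learn).
pip install scikit-datasets

# Repository-specific extras, e.g. CRAN and Keel support.
pip install "scikit-datasets[cran,keel]"
```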
4 changes: 2 additions & 2 deletions skdatasets/repositories/__init__.py
@@ -29,8 +29,8 @@


def fetch(repository, dataset, collection=None, **kwargs):
try:
if collection:
data = repos[repository].fetch(collection, dataset, **kwargs)
except:
else:
data = repos[repository].fetch(dataset, **kwargs)
return data
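For context, a minimal usage sketch of the corrected dispatch above: when `collection` is given, it is now forwarded explicitly (collection first, then dataset) instead of the old bare `except` silently falling back. The repository, dataset and collection names below are illustrative assumptions, not taken from this commit:
```
from skdatasets.repositories import fetch

# No collection: the dataset name is passed straight to the repository.
data = fetch('uci', 'wine')

# With a collection: the collection is forwarded first, then the dataset.
data = fetch('libsvm', 'australian', collection='binary')
```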
88 changes: 34 additions & 54 deletions skdatasets/utils/experiment.py
@@ -4,11 +4,9 @@
"""

import numpy as np
import os
from sacred import Experiment, Ingredient
from sklearn.model_selection import cross_validate, PredefinedSplit

from skdatasets.utils.validation import scatter_plot, metaparameter_plot, history_plot
from tempfile import TemporaryFile


def experiment(dataset, estimator):
@@ -41,38 +39,24 @@ def experiment(dataset, estimator):
experiment = Experiment(ingredients=(_dataset, _estimator))

@experiment.automain
def run(cross_validate=cross_validate, return_estimator=False):
def run(return_estimator=False, save_output=False):
"""Run the experiment.
Run the experiment.
Parameters
----------
cross_validate : function, default=cross_validate
Function to evaluate metrics by cross-validation. Must receive the
estimator, X, y (might be None) and cv (might be None). Must return
a dictionary with the cross-validation score and maybe other info,
like a list of fitted estimators.
return_estimator : boolean, default False
Whether to return the estimator or estimators fitted.
save_output : boolean, default False
Whether to save the output as an artifact.
"""
data = dataset()
for a in ('target', 'data_test', 'target_test', 'inner_cv', 'outer_cv'):
if a not in data:
setattr(data, a, None)

def _explicit_folds(data):
"""Prepare a dataset where the CV folds are explicit."""
X = np.array([]).reshape((0, *data.inner_cv[0][0].shape[1:]))
y = np.array([]).reshape((0, *data.inner_cv[0][1].shape[1:]))
cv = []
for i, (X_, y_, X_test_, y_test_) in enumerate(data.inner_cv):
X = np.concatenate((X, X_, X_test_))
y = np.concatenate((y, y_, y_test_))
cv = cv + [-1]*len(X_) + [i]*len(X_test_)
return X, y, cv

def _estimator(cv=None):
"""Create an estimator with or without hyperparameter search."""
try:
@@ -81,46 +65,43 @@ def _estimator(cv=None):
e = estimator()
return e

def _plots(e, i, X, y):
"""Create different descriptive plots."""
# Metaparameter plots
image_files = metaparameter_plot(e, image_file=f'metaparameter_{i}.pdf')
for image_file in image_files:
experiment.add_artifact(image_file)
print("Removing " + image_file)
os.remove(image_file)
# Scatter plots
image_files = scatter_plot(X, y, e, image_file=f'scatter_{i}.pdf')
for image_file in image_files:
experiment.add_artifact(image_file)
print("Removing " + image_file)
os.remove(image_file)
def _output(e, X):
"""Generate the outputs of an estimator."""
outputs = dict()
for output in ('transform', 'predict'):
if hasattr(e, output):
outputs[output] = getattr(e, output)(X)
return outputs

# Inner CV for metaparameter search
if hasattr(data.inner_cv, '__iter__'):
# Explicit CV folds
X, y, cv = _explicit_folds(data)
if hasattr(data.inner_cv, '__iter__'): # Explicit CV folds
X = np.array([]).reshape((0, *data.inner_cv[0][0].shape[1:]))
y = np.array([]).reshape((0, *data.inner_cv[0][1].shape[1:]))
cv = []
for i, (X_, y_, X_test_, y_test_) in enumerate(data.inner_cv):
X = np.concatenate((X, X_, X_test_))
y = np.concatenate((y, y_, y_test_))
cv = cv + [-1]*len(X_) + [i]*len(X_test_)
e = _estimator(cv=PredefinedSplit(cv))
e.fit(X, y=y)
if hasattr(e, 'best_estimator_'):
e.fit = e.best_estimator_.fit
else:
# Automatic/indexed CV folds
else: # Automatic/indexed CV folds
e = _estimator(cv=data.inner_cv)

# Outer CV/test partition for model assessment
if data.data_test is not None:
# Test partition
if data.data_test is not None: # Test partition
e.fit(data.data, y=data.target)
scores = {'test_score': [e.score(data.data_test,
y=data.target_test)]}
if return_estimator:
scores['estimator'] = [e]
_plots(e, 0, data.data_test, data.target_test)
else:
# Outer CV
if hasattr(data.outer_cv, '__iter__'):
# Explicit CV folds
if save_output:
with TemporaryFile() as tmpfile:
np.save(tmpfile, _output(e, data.data_test))
experiment.add_artifact(tmpfile, name='output.npy')
else: # Outer CV
if hasattr(data.outer_cv, '__iter__'): # Explicit CV folds
scores = {'test_score': []}
if return_estimator:
scores['estimator'] = []
@@ -129,16 +110,15 @@ def _plots(e, i, X, y):
scores['test_score'].append(e.score(X_test, y=y_test))
if return_estimator:
scores['estimator'].append(e)
_plots(e, i, X_test, y_test)
else:
# Automatic/indexed CV folds
if save_output:
with TemporaryFile() as tmpfile:
np.save(tmpfile, _output(e, X_test))
experiment.add_artifact(tmpfile,
name=f'output_{i}.npy')
else: # Automatic/indexed CV folds
scores = cross_validate(e, data.data, y=data.target,
cv=data.outer_cv,
return_estimator=True)
for i, e in enumerate(scores['estimator']):
_plots(e, i, data.data, data.target)
if not return_estimator:
scores.pop('estimator')
return_estimator=return_estimator)
experiment.log_scalar('score_mean', np.nanmean(scores['test_score']))
experiment.log_scalar('score_std', np.nanstd(scores['test_score']))
experiment.info.update(scores)
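For readers unfamiliar with how the explicit inner-CV branch above encodes folds, here is a minimal, self-contained sketch of scikit-learn's `PredefinedSplit` semantics with invented data: an entry of -1 keeps a sample in every training set, while an entry `i` assigns it to the test set of split `i`, which is exactly what the `[-1]*len(X_) + [i]*len(X_test_)` list built above expresses.
```
import numpy as np
from sklearn.model_selection import PredefinedSplit

# Six samples: the first two are always in the training set (-1),
# the next two form the test set of split 0, the last two of split 1.
test_fold = np.array([-1, -1, 0, 0, 1, 1])
splitter = PredefinedSplit(test_fold)
for train_idx, test_idx in splitter.split():
    print(train_idx, test_idx)
# [0 1 4 5] [2 3]
# [0 1 2 3] [4 5]
```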
