diff --git a/.github/workflows/code-climate.yml b/.github/workflows/codeclimate.yml similarity index 90% rename from .github/workflows/code-climate.yml rename to .github/workflows/codeclimate.yml index 8be0eec..9baec17 100644 --- a/.github/workflows/code-climate.yml +++ b/.github/workflows/codeclimate.yml @@ -1,19 +1,16 @@ name: CodeClimate upload - on: workflow_run: workflows: [Tests] types: - completed - jobs: download: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - + - uses: actions/checkout@v3 - name: 'Download artifact' - uses: actions/github-script@v5 + uses: actions/github-script@v6 with: script: | let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ @@ -32,15 +29,12 @@ jobs: }); let fs = require('fs'); fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/code-coverage-report.zip`, Buffer.from(download.data)); - - name: 'Unzip artifact' run: unzip code-coverage-report.zip - - name: Install dependencies run: | pip3 install codecov pytest-cov || pip3 install --user codecov pytest-cov; - - name: Upload coverage to CodeClimate - uses: paambaati/codeclimate-action@v3.0.0 + uses: paambaati/codeclimate-action@v3.2.0 env: CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 29d0720..9d4d0fe 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -3,12 +3,14 @@ on: push: branches: - master + - develop + pull_request: jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/setup-python@v2 - - uses: actions/checkout@master + - uses: actions/setup-python@v4 + - uses: actions/checkout@v3 with: fetch-depth: 0 # otherwise, you will failed to push refs to dest repo - name: Install package diff --git a/.github/workflows/python-publish.yml b/.github/workflows/pypi.yml similarity index 84% rename from .github/workflows/python-publish.yml rename to .github/workflows/pypi.yml index ec70354..bc13962 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/pypi.yml @@ -1,29 +1,22 @@ # This workflow will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - # This workflow uses actions that are not certified by GitHub. # They are provided by a third-party and are governed by # separate terms of service, privacy policy, and support # documentation. - name: Upload Python Package - on: release: types: [published] - permissions: contents: read - jobs: deploy: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v3 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.x' - name: Install dependencies @@ -33,7 +26,7 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3559f97..02e9a08 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,9 +1,7 @@ name: Tests - on: push: pull_request: - jobs: build: runs-on: ${{ matrix.os }} @@ -11,29 +9,23 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9'] - + python-version: ['3.8', '3.9', '3.10'] steps: - - uses: actions/checkout@v2 - + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies run: | pip3 install codecov pytest-cov || pip3 install --user codecov pytest-cov; - - name: Run tests run: | - #pip3 debug --verbose . pip3 install --upgrade-strategy eager -v ".[test]" coverage run --source=skdatasets/ -m pytest; coverage xml -o coverage.xml # explicitely exporting coverage file to be read by coverage report command. - - name: Archive code coverage results uses: actions/upload-artifact@v3 with: name: code-coverage-report - path: coverage.xml + path: coverage.xml \ No newline at end of file diff --git a/README.md b/README.md index acde3d2..96e53cc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ Scikit-learn-compatible datasets [![Tests](https://github.com/daviddiazvico/scikit-datasets/actions/workflows/tests.yml/badge.svg)](https://github.com/daviddiazvico/scikit-datasets/actions/workflows/tests.yml) [![Maintainability](https://api.codeclimate.com/v1/badges/a37c9ee152b41a0cb577/maintainability)](https://codeclimate.com/github/daviddiazvico/scikit-datasets/maintainability) [![Test Coverage](https://api.codeclimate.com/v1/badges/a37c9ee152b41a0cb577/test_coverage)](https://codeclimate.com/github/daviddiazvico/scikit-datasets/test_coverage) -[![Build Status](https://dev.azure.com/daviddiazvico0337/daviddiazvico/_apis/build/status/daviddiazvico.scikit-datasets?branchName=master)](https://dev.azure.com/daviddiazvico0337/daviddiazvico/_build/latest?definitionId=1&branchName=master) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6383047.svg)](https://doi.org/10.5281/zenodo.6383047) ## Installation diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index b6ec8b4..0000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: -- master - -pool: - vmImage: 'ubuntu-latest' -strategy: - matrix: - Python38: - python.version: '3.8' - -steps: -- task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - displayName: 'Use Python $(python.version)' - -- script: | - python -m pip install --upgrade pip - pip install ".[test]" - displayName: 'Install dependencies' - -- script: | - pip install pytest-azurepipelines - pytest - displayName: 'Test' diff --git a/conftest.py b/conftest.py index 5c82d79..ba397d2 100644 --- a/conftest.py +++ b/conftest.py @@ -1,19 +1,19 @@ -import pytest - -collect_ignore = ['setup.py'] - - -def pytest_addoption(parser): - parser.addoption( - "--runslow", action="store_true", default=False, help="run slow tests" - ) - - -def pytest_collection_modifyitems(config, items): - if config.getoption("--runslow"): - # --runslow given in cli: do not skip slow tests - return - skip_slow = pytest.mark.skip(reason="need --runslow option to run") - for item in items: - if "slow" in item.keywords: - item.add_marker(skip_slow) +import pytest + +collect_ignore = ["setup.py"] + + +def pytest_addoption(parser): + parser.addoption( + "--runslow", action="store_true", default=False, help="run slow tests" + ) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runslow"): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/docs/conf.py b/docs/conf.py index b958beb..f8d9ab1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,95 +1,97 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -import sys - -import pkg_resources - -try: - release = pkg_resources.get_distribution('scikit-datasets').version -except pkg_resources.DistributionNotFound: - print('To build the documentation, The distribution information of\n' - 'scikit-datasets has to be available. Either install the package\n' - 'into your development environment or run "setup.py develop"\n' - 'to setup the metadata. A virtualenv is recommended!\n') - sys.exit(1) -del pkg_resources - -version = '.'.join(release.split('.')[:2]) - -project = 'scikit-datasets' -copyright = '2020, David Diaz Vico' -author = 'David Diaz Vico' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'sphinx.ext.napoleon', - 'sphinx.ext.intersphinx', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = 'en' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'alabaster' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - - -# -- Extension configuration ------------------------------------------------- -intersphinx_mapping = { - 'python': ( - 'https://docs.python.org/{.major}'.format(sys.version_info), - None, - ), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), - 'sklearn': ('https://scikit-learn.org/stable', None), - 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), - 'sacred': ('https://sacred.readthedocs.io/en/stable/', None), -} +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +import sys + +import pkg_resources + +try: + release = pkg_resources.get_distribution("scikit-datasets").version +except pkg_resources.DistributionNotFound: + print( + "To build the documentation, The distribution information of\n" + "scikit-datasets has to be available. Either install the package\n" + 'into your development environment or run "setup.py develop"\n' + "to setup the metadata. A virtualenv is recommended!\n" + ) + sys.exit(1) +del pkg_resources + +version = ".".join(release.split(".")[:2]) + +project = "scikit-datasets" +copyright = "2020, David Diaz Vico" +author = "David Diaz Vico" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "alabaster" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + + +# -- Extension configuration ------------------------------------------------- +intersphinx_mapping = { + "python": ( + "https://docs.python.org/{.major}".format(sys.version_info), + None, + ), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "sklearn": ("https://scikit-learn.org/stable", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "sacred": ("https://sacred.readthedocs.io/en/stable/", None), +} diff --git a/pyproject.toml b/pyproject.toml index ac6cd58..dbe0d65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "scikit-datasets" -version = "0.2.2" +version = "0.2.3" description = "Scikit-learn-compatible datasets" readme = "README.md" requires-python = ">=3.8" @@ -21,14 +21,9 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ] - -dependencies = [ - "numpy", - "scipy", - "scikit-learn", -] - +dependencies = ["numpy", "scipy", "scikit-learn"] [project.optional-dependencies] cran = ["rdata"] forex = ["forex_python>=1.6"] @@ -38,20 +33,13 @@ physionet = ["pandas", "wfdb"] utils-estimator = ["jsonpickle"] utils-experiments = ["sacred", "incense"] utils-scores = ["statsmodels", "jinja2"] -all = ["scikit-datasets[cran,forex,keel,keras,physionet,utils-estimator,utils-experiments,utils-scores]"] -test = [ - "pytest", - "pytest-cov[all]", - "coverage", - "scikit-datasets[all]", -] - +all = ["scikit-datasets[cran, forex, keel, keras, physionet, utils-estimator, utils-experiments, utils-scores]"] +test = ["pytest", "pytest-cov[all]", "coverage", "scikit-datasets[all]"] [project.urls] homepage = "https://github.com/daviddiazvico/scikit-datasets" documentation = "https://daviddiazvico.github.io/scikit-datasets/" repository = "https://github.com/daviddiazvico/scikit-datasets" -download = "https://github.com/daviddiazvico/scikit-datasets/archive/v0.2.1.tar.gz" - +download = "https://github.com/daviddiazvico/scikit-datasets/archive/v0.2.2.tar.gz" [build-system] # Minimum requirements for the build system to execute. -requires = ["setuptools", "wheel"] # PEP 508 specifications. \ No newline at end of file +requires = ["setuptools", "wheel"] # PEP 508 specifications. diff --git a/skdatasets/repositories/__init__.py b/skdatasets/repositories/__init__.py index 7b3f8a4..d7a7604 100644 --- a/skdatasets/repositories/__init__.py +++ b/skdatasets/repositories/__init__.py @@ -5,31 +5,42 @@ from . import aneurisk, libsvm, raetsch, sklearn, uci, ucr -repos = {'libsvm': libsvm, 'raetsch': raetsch, 'sklearn': sklearn, 'uci': uci, - 'ucr': ucr, 'aneurisk': aneurisk} +repos = { + "libsvm": libsvm, + "raetsch": raetsch, + "sklearn": sklearn, + "uci": uci, + "ucr": ucr, + "aneurisk": aneurisk, +} try: from . import cran - repos['cran'] = cran + + repos["cran"] = cran except ImportError: pass try: from . import forex - repos['forex'] = forex + + repos["forex"] = forex except ImportError: pass try: from . import keel - repos['keel'] = keel + + repos["keel"] = keel except ImportError: pass try: from . import keras - repos['keras'] = keras + + repos["keras"] = keras except ImportError: pass try: from . import physionet - repos['physionet'] = physionet + + repos["physionet"] = physionet except ImportError: pass diff --git a/skdatasets/repositories/aneurisk.py b/skdatasets/repositories/aneurisk.py index bda40dd..c872343 100644 --- a/skdatasets/repositories/aneurisk.py +++ b/skdatasets/repositories/aneurisk.py @@ -26,7 +26,9 @@ def fetch(name="Aneurisk65", *, data_home=None, return_X_y=False): n_samples = 65 - url = "http://ecm2.mathcs.emory.edu/aneuriskdata/files/Carotid-data_MBI_workshop.zip" + url = ( + "http://ecm2.mathcs.emory.edu/aneuriskdata/files/Carotid-data_MBI_workshop.zip" + ) dataset_path = fetch_zip( dataname=name, @@ -36,32 +38,32 @@ def fetch(name="Aneurisk65", *, data_home=None, return_X_y=False): ) patient_dtype = [ - ('patient', np.int_), - ('code', 'U8'), - ('type', 'U1'), - ('aneurysm location', np.float_), - ('left_right', 'U2'), + ("patient", np.int_), + ("code", "U8"), + ("type", "U1"), + ("aneurysm location", np.float_), + ("left_right", "U2"), ] functions_dtype = [ - ('curvilinear abscissa', np.object_), - ('MISR', np.object_), - ('X0 observed', np.object_), - ('Y0 observed', np.object_), - ('Z0 observed', np.object_), - ('X0 observed FKS', np.object_), - ('Y0 observed FKS', np.object_), - ('Z0 observed FKS', np.object_), - ('X0 observed FKS reflected', np.object_), - ('X1 observed FKS', np.object_), - ('Y1 observed FKS', np.object_), - ('Z1 observed FKS', np.object_), - ('X1 observed FKS reflected', np.object_), - ('X2 observed FKS', np.object_), - ('Y2 observed FKS', np.object_), - ('Z2 observed FKS', np.object_), - ('X2 observed FKS reflected', np.object_), - ('Curvature FKS', np.object_), + ("curvilinear abscissa", np.object_), + ("MISR", np.object_), + ("X0 observed", np.object_), + ("Y0 observed", np.object_), + ("Z0 observed", np.object_), + ("X0 observed FKS", np.object_), + ("Y0 observed FKS", np.object_), + ("Z0 observed FKS", np.object_), + ("X0 observed FKS reflected", np.object_), + ("X1 observed FKS", np.object_), + ("Y1 observed FKS", np.object_), + ("Z1 observed FKS", np.object_), + ("X1 observed FKS reflected", np.object_), + ("X2 observed FKS", np.object_), + ("Y2 observed FKS", np.object_), + ("Z2 observed FKS", np.object_), + ("X2 observed FKS reflected", np.object_), + ("Curvature FKS", np.object_), ] complete_dtype = patient_dtype + functions_dtype @@ -69,10 +71,10 @@ def fetch(name="Aneurisk65", *, data_home=None, return_X_y=False): X = np.zeros(shape=n_samples, dtype=complete_dtype) X[[p[0] for p in patient_dtype]] = np.genfromtxt( - dataset_path / 'Patients.txt', + dataset_path / "Patients.txt", dtype=patient_dtype, skip_header=1, - missing_values=('NA',), + missing_values=("NA",), ) for i in range(n_samples): diff --git a/skdatasets/repositories/base.py b/skdatasets/repositories/base.py index d2d071f..03b81df 100644 --- a/skdatasets/repositories/base.py +++ b/skdatasets/repositories/base.py @@ -95,7 +95,7 @@ def fetch_file( # store file try: - with open(filename, 'w+b') as data_file: + with open(filename, "w+b") as data_file: copyfileobj(data_url, data_file) except Exception: filename.unlink() @@ -130,16 +130,14 @@ def _missing_files( members_zip = compressed_file.infolist() return [ - info for info in members_zip + info + for info in members_zip if not (data_home_path / info.filename).exists() ] members_tar = compressed_file.getmembers() - return [ - info for info in members_tar - if not (data_home_path / info.name).exists() - ] + return [info for info in members_tar if not (data_home_path / info.name).exists()] def fetch_compressed( @@ -148,7 +146,7 @@ def fetch_compressed( compression_open: OpenMethod, subfolder: Optional[str] = None, data_home: Optional[str] = None, - open_format: str = 'r', + open_format: str = "r", ) -> pathlib.Path: """Fetch compressed dataset. @@ -268,7 +266,7 @@ def fetch_tgz( compression_open=tarfile.open, subfolder=subfolder, data_home=data_home, - open_format='r:gz', + open_format="r:gz", ) @@ -358,26 +356,17 @@ def dataset_from_dataframe( ): data_dataframe = ( - frame - if target_column is None - else frame.drop(target_column, axis=1) - ) - target_dataframe = ( - None - if target_column is None - else frame.loc[:, target_column] + frame if target_column is None else frame.drop(target_column, axis=1) ) + target_dataframe = None if target_column is None else frame.loc[:, target_column] - data = ( - data_dataframe - if as_frame is True - else data_dataframe.to_numpy() - ) + data = data_dataframe if as_frame is True else data_dataframe.to_numpy() target = ( None if target_dataframe is None - else target_dataframe if as_frame is True + else target_dataframe + if as_frame is True else target_dataframe.to_numpy() ) @@ -385,11 +374,7 @@ def dataset_from_dataframe( return data, target feature_names = list(data_dataframe.keys()) - target_names = ( - None - if target_dataframe is None - else list(target_dataframe.keys()) - ) + target_names = None if target_dataframe is None else list(target_dataframe.keys()) bunch = Bunch( data=data, diff --git a/skdatasets/repositories/cran.py b/skdatasets/repositories/cran.py index 927b297..20a2c46 100644 --- a/skdatasets/repositories/cran.py +++ b/skdatasets/repositories/cran.py @@ -46,7 +46,7 @@ def __init__(self, *, convert_charrefs: bool = True) -> None: self.last_is_version = False self.version: str | None = None - self.version_regex = re.compile('(?i).*version.*') + self.version_regex = re.compile("(?i).*version.*") self.handling_td = False def handle_starttag( @@ -78,7 +78,7 @@ def _get_latest_version_online(package_name: str, dataset_name: str) -> str: ) try: with urllib.request.urlopen(url_request) as url_file: - url_content = url_file.read().decode('utf-8') + url_content = url_file.read().decode("utf-8") except urllib.error.HTTPError as e: if e.code == 404: raise DatasetNotFoundError(f"{package_name}/{dataset_name}") from e @@ -105,7 +105,7 @@ def _get_latest_version_offline(package_name: str) -> str | None: if downloaded_packages: versions = [ - LooseVersion(p.name[(len(package_name) + 1):-len(".tar.gz")]) + LooseVersion(p.name[(len(package_name) + 1) : -len(".tar.gz")]) for p in downloaded_packages ] @@ -153,15 +153,12 @@ def _get_urls( version: str | None = None, ) -> Sequence[str]: - version = _get_version( - package_name, dataset_name=dataset_name, version=version) + version = _get_version(package_name, dataset_name=dataset_name, version=version) filename = f"{package_name}_{version}.tar.gz" latest_url = f"{CRAN_URL}/src/contrib/{filename}" - archive_url = ( - f"{CRAN_URL}/src/contrib/Archive/{package_name}/{filename}" - ) + archive_url = f"{CRAN_URL}/src/contrib/Archive/{package_name}/{filename}" return (latest_url, archive_url) @@ -191,7 +188,7 @@ def _download_package_data( for i, url in enumerate(url_list): try: - directory = _fetch_tgz(folder_name, url, subfolder='cran') + directory = _fetch_tgz(folder_name, url, subfolder="cran") break except Exception: # If it is the last url, reraise @@ -266,8 +263,7 @@ def fetch_dataset( possible_names = list(data_path.glob(dataset_name + ".*")) if len(possible_names) != 1: raise FileNotFoundError( - f"Dataset {dataset_name} not found in " - f"package {package_name}", + f"Dataset {dataset_name} not found in " f"package {package_name}", ) file_path = data_path / possible_names[0] @@ -341,7 +337,7 @@ def fetch_package( for dataset in data_path.iterdir(): - if dataset.suffix.lower() in ['.rda', '.rdata']: + if dataset.suffix.lower() in [".rda", ".rdata"]: try: parsed = rdata.parser.parse_file(dataset) @@ -366,9 +362,9 @@ class _DatasetArguments(TypedDict): datasets: Mapping[str, _DatasetArguments] = { - 'geyser': { - 'load_args': (['geyser.rda', 'MASS'], {}), - 'sklearn_args': ([], {'target_name': 'waiting'}), + "geyser": { + "load_args": (["geyser.rda", "MASS"], {}), + "sklearn_args": ([], {"target_name": "waiting"}), }, } @@ -447,10 +443,10 @@ def fetch( (data, target) : tuple if ``return_X_y`` is True """ - load_args = datasets[name]['load_args'] + load_args = datasets[name]["load_args"] dataset = fetch_dataset(*load_args[0], **load_args[1]) - sklearn_args = datasets[name]['sklearn_args'] + sklearn_args = datasets[name]["sklearn_args"] sklearn_dataset = _to_sklearn(dataset, *sklearn_args[0], **sklearn_args[1]) if return_X_y: diff --git a/skdatasets/repositories/forex.py b/skdatasets/repositories/forex.py index 8e0a3f3..4f7315f 100644 --- a/skdatasets/repositories/forex.py +++ b/skdatasets/repositories/forex.py @@ -26,7 +26,7 @@ def _fetch(get_rate, start=date(2015, 1, 1), end=date.today()): return np.asarray(data).reshape((-1, 1)) -def _load_bitcoin(start=date(2015, 1, 1), end=date.today(), currency='EUR'): +def _load_bitcoin(start=date(2015, 1, 1), end=date.today(), currency="EUR"): """Load bitcoin dataset""" btcc = BtcConverter() @@ -36,8 +36,9 @@ def get_rate(day): return _fetch(get_rate, start=start, end=end) -def _load_forex(start=date(2015, 1, 1), end=date.today(), currency_1='USD', - currency_2='EUR'): +def _load_forex( + start=date(2015, 1, 1), end=date.today(), currency_1="USD", currency_2="EUR" +): """Load forex dataset.""" cr = CurrencyRates() @@ -48,8 +49,13 @@ def get_rate(day): return _fetch(get_rate, start=start, end=end) -def fetch(start=date(2015, 1, 1), end=date.today(), currency_1='USD', - currency_2='EUR', return_X_y=False): +def fetch( + start=date(2015, 1, 1), + end=date.today(), + currency_1="USD", + currency_2="EUR", + return_X_y=False, +): """Fetch Forex datasets. Fetches the ECB Forex and Coindesk Bitcoin datasets. More info at @@ -76,17 +82,18 @@ def fetch(start=date(2015, 1, 1), end=date.today(), currency_1='USD', (data, target) : tuple if ``return_X_y`` is True """ - if currency_1 == 'BTC': + if currency_1 == "BTC": X = _load_bitcoin(start=start, end=end, currency=currency_2) - descr = 'BTC-' + str(currency_2) - elif currency_2 == 'BTC': + descr = "BTC-" + str(currency_2) + elif currency_2 == "BTC": X = _load_bitcoin(start=start, end=end, currency=currency_1) - descr = 'BTC-' + str(currency_1) + descr = "BTC-" + str(currency_1) else: - X = _load_forex(start=start, end=end, currency_1=currency_1, - currency_2=currency_2) - descr = str(currency_1) + '-' + str(currency_2) - descr = descr + start.strftime('%Y-%m-%d') + '-' + end.strftime('%Y-%m-%d') + X = _load_forex( + start=start, end=end, currency_1=currency_1, currency_2=currency_2 + ) + descr = str(currency_1) + "-" + str(currency_2) + descr = descr + start.strftime("%Y-%m-%d") + "-" + end.strftime("%Y-%m-%d") if return_X_y: return X, None diff --git a/skdatasets/repositories/keel.py b/skdatasets/repositories/keel.py index 84a2fd5..e288fcd 100644 --- a/skdatasets/repositories/keel.py +++ b/skdatasets/repositories/keel.py @@ -8,7 +8,6 @@ import io import os -import sys from pathlib import Path from types import MappingProxyType from typing import ( @@ -31,44 +30,46 @@ from .base import fetch_file -BASE_URL = 'http://sci2s.ugr.es/keel' -COLLECTIONS: Final = frozenset(( - 'classification', - 'missing', - 'imbalanced', - 'multiInstance', - 'multilabel', - 'textClassification', - 'classNoise', - 'attributeNoise', - 'semisupervised', - 'regression', - 'timeseries', - 'unsupervised', - 'lowQuality', -)) +BASE_URL = "http://sci2s.ugr.es/keel" +COLLECTIONS: Final = frozenset( + ( + "classification", + "missing", + "imbalanced", + "multiInstance", + "multilabel", + "textClassification", + "classNoise", + "attributeNoise", + "semisupervised", + "regression", + "timeseries", + "unsupervised", + "lowQuality", + ) +) # WTFs IMBALANCED_URLS: Final = ( - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p1', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p2', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p3', - 'dataset/data/imbalanced', - 'keel-dataset/datasets/imbalanced/imb_noisyBordExamples', - 'keel-dataset/datasets/imbalanced/preprocessed', + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p1", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p2", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p3", + "dataset/data/imbalanced", + "keel-dataset/datasets/imbalanced/imb_noisyBordExamples", + "keel-dataset/datasets/imbalanced/preprocessed", ) IRREGULAR_DESCR_IMBALANCED_URLS: Final = ( - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p1', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p2', - 'keel-dataset/datasets/imbalanced/imb_IRhigherThan9p3', + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p1", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p2", + "keel-dataset/datasets/imbalanced/imb_IRhigherThan9p3", ) INCORRECT_DESCR_IMBALANCED_URLS: Final = MappingProxyType( - {'semisupervised': 'classification'}, + {"semisupervised": "classification"}, ) @@ -87,22 +88,26 @@ def __init__( self.Xs_test = Xs_test self.ys_test = ys_test - def __iter__(self) -> Iterator[Tuple[ - np.typing.NDArray[float], - np.typing.NDArray[Union[int, float]], - np.typing.NDArray[float], - np.typing.NDArray[Union[int, float]], - ]]: + def __iter__( + self, + ) -> Iterator[ + Tuple[ + np.typing.NDArray[float], + np.typing.NDArray[Union[int, float]], + np.typing.NDArray[float], + np.typing.NDArray[Union[int, float]], + ] + ]: return zip(self.Xs, self.ys, self.Xs_test, self.ys_test) def _load_Xy( zipfile: Path, csvfile: str, - sep: str = ',', + sep: str = ",", header: Optional[int] = None, - engine: str = 'python', - na_values: AbstractSet[str] = frozenset(('?')), + engine: str = "python", + na_values: AbstractSet[str] = frozenset(("?")), **kwargs: Any, ) -> Tuple[np.typing.NDArray[float], np.typing.NDArray[Union[int, float]]]: """Load a zipped csv file with target in the last column.""" @@ -129,14 +134,14 @@ def _load_descr( data_home: Optional[str] = None, ) -> Tuple[int, str]: """Load a dataset description.""" - subfolder = os.path.join('keel', collection) - filename = name + '-names.txt' - if collection == 'imbalanced': + subfolder = os.path.join("keel", collection) + filename = name + "-names.txt" + if collection == "imbalanced": for url in IMBALANCED_URLS: if url in IRREGULAR_DESCR_IMBALANCED_URLS: - url = BASE_URL + '/' + url + '/' + 'names' + '/' + filename + url = BASE_URL + "/" + url + "/" + "names" + "/" + filename else: - url = BASE_URL + '/' + url + '/' + filename + url = BASE_URL + "/" + url + "/" + filename try: f = fetch_file( dataname=name, @@ -173,10 +178,10 @@ def _fetch_keel_zip( data_home: Optional[str] = None, ) -> Path: """Fetch Keel dataset zip file.""" - subfolder = os.path.join('keel', collection) - if collection == 'imbalanced': + subfolder = os.path.join("keel", collection) + if collection == "imbalanced": for url in IMBALANCED_URLS: - url = BASE_URL + '/' + url + '/' + filename + url = BASE_URL + "/" + url + "/" + filename try: return fetch_file( dataname=name, @@ -210,13 +215,13 @@ def _load_folds( Optional[KeelOuterCV], ]: """Load a dataset folds.""" - filename = name + '.zip' + filename = name + ".zip" f = _fetch_keel_zip(collection, name, filename, data_home=data_home) - X, y = _load_Xy(f, name + '.dat', skiprows=nattrs + 4) + X, y = _load_Xy(f, name + ".dat", skiprows=nattrs + 4) cv = None if nfolds in (5, 10): - fold = 'dobscv' if dobscv else 'fold' - filename = name + '-' + str(nfolds) + '-' + fold + '.zip' + fold = "dobscv" if dobscv else "fold" + filename = name + "-" + str(nfolds) + "-" + fold + ".zip" f = _fetch_keel_zip(collection, name, filename, data_home=data_home) Xs = [] ys = [] @@ -228,11 +233,10 @@ def _load_folds( _name = f"{name}/{name}-{nfolds}dobscv-{i + 1}" else: _name = f"{name}-{nfolds}-{i + 1}" - X_fold, y_fold = _load_Xy( - f, _name + 'tra.dat', skiprows=nattrs + 4) + X_fold, y_fold = _load_Xy(f, _name + "tra.dat", skiprows=nattrs + 4) X_test_fold, y_test_fold = _load_Xy( f, - _name + 'tst.dat', + _name + "tst.dat", skiprows=nattrs + 4, ) Xs.append(X_fold) @@ -317,7 +321,7 @@ def fetch( """ if collection not in COLLECTIONS: - raise ValueError('Avaliable collections are ' + str(list(COLLECTIONS))) + raise ValueError("Avaliable collections are " + str(list(COLLECTIONS))) nattrs, DESCR = _load_descr(collection, name, data_home=data_home) X, y, cv = _load_folds( collection, diff --git a/skdatasets/repositories/keras.py b/skdatasets/repositories/keras.py index 2fc32bb..5914736 100644 --- a/skdatasets/repositories/keras.py +++ b/skdatasets/repositories/keras.py @@ -23,13 +23,13 @@ ) DATASETS = { - 'boston_housing': boston_housing.load_data, - 'cifar10': cifar10.load_data, - 'cifar100': cifar100.load_data, - 'fashion_mnist': fashion_mnist.load_data, - 'imdb': imdb.load_data, - 'mnist': mnist.load_data, - 'reuters': reuters.load_data, + "boston_housing": boston_housing.load_data, + "cifar10": cifar10.load_data, + "cifar100": cifar100.load_data, + "fashion_mnist": fashion_mnist.load_data, + "imdb": imdb.load_data, + "mnist": mnist.load_data, + "reuters": reuters.load_data, } @@ -83,7 +83,7 @@ def fetch( """ (X_train, y_train), (X_test, y_test) = DATASETS[name](**kwargs) if len(X_train.shape) > 2: - name = name + ' ' + str(X_train.shape[1:]) + ' shaped' + name = name + " " + str(X_train.shape[1:]) + " shaped" X_max = np.iinfo(X_train[0][0].dtype).max n_features = np.prod(X_train.shape[1:]) X_train = X_train.reshape([X_train.shape[0], n_features]) / X_max diff --git a/skdatasets/repositories/libsvm.py b/skdatasets/repositories/libsvm.py index 9169657..c46c1b8 100644 --- a/skdatasets/repositories/libsvm.py +++ b/skdatasets/repositories/libsvm.py @@ -7,7 +7,6 @@ from __future__ import annotations import os -import sys from typing import Final, Literal, Sequence, Tuple, overload import numpy as np @@ -18,13 +17,15 @@ from .base import DatasetNotFoundError, fetch_file -BASE_URL: Final = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets' -COLLECTIONS: Final = frozenset(( - 'binary', - 'multiclass', - 'regression', - 'string', -)) +BASE_URL: Final = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets" +COLLECTIONS: Final = frozenset( + ( + "binary", + "multiclass", + "regression", + "string", + ) +) def _fetch_partition( @@ -34,8 +35,8 @@ def _fetch_partition( data_home: str | None = None, ) -> str | None: """Fetch dataset partition.""" - subfolder = os.path.join('libsvm', collection) - dataname = name.replace('/', '-') + subfolder = os.path.join("libsvm", collection) + dataname = name.replace("/", "-") url = f"{BASE_URL}/{collection}/{name}{partition}" @@ -68,20 +69,26 @@ def _load( PredefinedSplit, ]: """Load dataset.""" - filename = _fetch_partition(collection, name, '', data_home) - filename_tr = _fetch_partition(collection, name, '.tr', data_home) - filename_val = _fetch_partition(collection, name, '.val', data_home) - filename_t = _fetch_partition(collection, name, '.t', data_home) - filename_r = _fetch_partition(collection, name, '.r', data_home) - - if (filename_tr is not None) and (filename_val is not None) and (filename_t is not None): - - _, _, X_tr, y_tr, X_val, y_val, X_test, y_test = load_svmlight_files([ - filename, - filename_tr, - filename_val, - filename_t, - ]) + filename = _fetch_partition(collection, name, "", data_home) + filename_tr = _fetch_partition(collection, name, ".tr", data_home) + filename_val = _fetch_partition(collection, name, ".val", data_home) + filename_t = _fetch_partition(collection, name, ".t", data_home) + filename_r = _fetch_partition(collection, name, ".r", data_home) + + if ( + (filename_tr is not None) + and (filename_val is not None) + and (filename_t is not None) + ): + + _, _, X_tr, y_tr, X_val, y_val, X_test, y_test = load_svmlight_files( + [ + filename, + filename_tr, + filename_val, + filename_t, + ] + ) cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0]) @@ -90,19 +97,23 @@ def _load( # Compute indices train_indices = list(range(X_tr.shape[0])) - validation_indices = list(range( - X_tr.shape[0], - X_tr.shape[0] + X_val.shape[0], - )) + validation_indices = list( + range( + X_tr.shape[0], + X_tr.shape[0] + X_val.shape[0], + ) + ) test_indices = list(range(X_tr.shape[0] + X_val.shape[0], X.shape[0])) elif (filename_tr is not None) and (filename_val is not None): - _, _, X_tr, y_tr, X_val, y_val = load_svmlight_files([ - filename, - filename_tr, - filename_val, - ]) + _, _, X_tr, y_tr, X_val, y_val = load_svmlight_files( + [ + filename, + filename_tr, + filename_val, + ] + ) cv = PredefinedSplit([-1] * X_tr.shape[0] + [0] * X_val.shape[0]) @@ -116,12 +127,12 @@ def _load( elif (filename_t is not None) and (filename_r is not None): - X_tr, y_tr, X_test, y_test, X_remaining, y_remaining = ( - load_svmlight_files([ + X_tr, y_tr, X_test, y_test, X_remaining, y_remaining = load_svmlight_files( + [ filename, filename_t, filename_r, - ]) + ] ) X = sp.sparse.vstack((X_tr, X_test, X_remaining)) @@ -132,7 +143,8 @@ def _load( validation_indices = [] test_indices = list( range( - X_tr.shape[0], X_tr.shape[0] + X_test.shape[0], + X_tr.shape[0], + X_tr.shape[0] + X_test.shape[0], ), ) @@ -140,10 +152,12 @@ def _load( elif filename_t is not None: - X_tr, y_tr, X_test, y_test = load_svmlight_files([ - filename, - filename_t, - ]) + X_tr, y_tr, X_test, y_test = load_svmlight_files( + [ + filename, + filename_t, + ] + ) X = sp.sparse.vstack((X_tr, X_test)) y = np.hstack((y_tr, y_test)) @@ -225,7 +239,7 @@ def fetch( """ if collection not in COLLECTIONS: - raise Exception('Avaliable collections are ' + str(list(COLLECTIONS))) + raise Exception("Avaliable collections are " + str(list(COLLECTIONS))) X, y, train_indices, validation_indices, test_indices, cv = _load( collection, diff --git a/skdatasets/repositories/physionet.py b/skdatasets/repositories/physionet.py index e62f0c2..491d035 100644 --- a/skdatasets/repositories/physionet.py +++ b/skdatasets/repositories/physionet.py @@ -29,7 +29,7 @@ BASE_URL: Final = "https://physionet.org/static/published-projects" INFO_STRING_SEMICOLONS_ONE_STR: Final = r"(\S*): (\S*)\s*" INFO_STRING_SEMICOLONS_SEVERAL_REGEX: Final = re.compile( - fr"(?:{INFO_STRING_SEMICOLONS_ONE_STR})+", + rf"(?:{INFO_STRING_SEMICOLONS_ONE_STR})+", ) INFO_STRING_SEMICOLONS_ONE_REGEX: Final = re.compile( INFO_STRING_SEMICOLONS_ONE_STR, @@ -62,7 +62,7 @@ def _get_zip_name_online(dataset_name: str) -> str: url_request = urllib.request.Request(url=f"{BASE_URL}/{dataset_name}") try: with urllib.request.urlopen(url_request) as url_file: - url_content = url_file.read().decode('utf-8') + url_content = url_file.read().decode("utf-8") except urllib.error.HTTPError as e: if e.code == 404: raise DatasetNotFoundError(dataset_name) from e @@ -102,8 +102,8 @@ def _get_info_strings(comments: Sequence[str]) -> Mapping[str, Any]: key = result.group(1) if key[0] == "<" and key[-1] == ">": key = key[1:-1] - info_strings_semicolons[key] = ( - _parse_info_string_value(result.group(2)) + info_strings_semicolons[key] = _parse_info_string_value( + result.group(2) ) else: split = comment.rsplit(maxsplit=1) @@ -115,9 +115,8 @@ def _get_info_strings(comments: Sequence[str]) -> Mapping[str, Any]: return info_strings_semicolons # Check for absurd things in spaces - if ( - len(info_strings_spaces) == 1 - or any(key.count(" ") > 3 for key in info_strings_spaces) + if len(info_strings_spaces) == 1 or any( + key.count(" ") > 3 for key in info_strings_spaces ): return {} @@ -244,17 +243,14 @@ def fetch( with open(path / "RECORDS") as records_file: records = [ - wfdb.io.rdrecord(str(path / record_name.rstrip('\n'))) + wfdb.io.rdrecord(str(path / record_name.rstrip("\n"))) for record_name in records_file ] info_strings = [_get_info_strings(r.comments) for r in records] info = _join_info_dicts(info_strings) - assert all( - _constant_attrs(r) == _constant_attrs(records[0]) - for r in records - ) + assert all(_constant_attrs(r) == _constant_attrs(records[0]) for r in records) data = { "signal": [r.p_signal for r in records], } diff --git a/skdatasets/repositories/raetsch.py b/skdatasets/repositories/raetsch.py index e783529..0a6d354 100644 --- a/skdatasets/repositories/raetsch.py +++ b/skdatasets/repositories/raetsch.py @@ -8,7 +8,6 @@ from __future__ import annotations import hashlib -import sys from pathlib import Path from typing import ( Final, @@ -27,21 +26,23 @@ from .base import fetch_file -DATASETS: Final = frozenset(( - 'banana', - 'breast_cancer', - 'diabetis', - 'flare_solar', - 'german', - 'heart', - 'image', - 'ringnorm', - 'splice', - 'thyroid', - 'titanic', - 'twonorm', - 'waveform', -)) +DATASETS: Final = frozenset( + ( + "banana", + "breast_cancer", + "diabetis", + "flare_solar", + "german", + "heart", + "image", + "ringnorm", + "splice", + "thyroid", + "titanic", + "twonorm", + "waveform", + ) +) class RaetschOuterCV(object): @@ -59,12 +60,16 @@ def __init__( self.train_splits = train_splits self.test_splits = test_splits - def __iter__(self) -> Iterator[Tuple[ - np.typing.NDArray[float], - np.typing.NDArray[Union[int, float]], - np.typing.NDArray[float], - np.typing.NDArray[Union[int, float]], - ]]: + def __iter__( + self, + ) -> Iterator[ + Tuple[ + np.typing.NDArray[float], + np.typing.NDArray[Union[int, float]], + np.typing.NDArray[float], + np.typing.NDArray[Union[int, float]], + ] + ]: return ( (self.X[tr - 1], self.y[tr - 1], self.X[ts - 1], self.y[ts - 1]) for tr, ts in zip(self.train_splits, self.test_splits) @@ -89,9 +94,9 @@ def _fetch_remote(data_home: Optional[str] = None) -> Path: Full path of the created file. """ file_path = fetch_file( - 'raetsch', - 'https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets' - '/raw/master/benchmarks.mat', + "raetsch", + "https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets" + "/raw/master/benchmarks.mat", data_home=data_home, ) sha256hash = hashlib.sha256() @@ -102,9 +107,7 @@ def _fetch_remote(data_home: Optional[str] = None) -> Path: break sha256hash.update(buffer) checksum = sha256hash.hexdigest() - remote_checksum = ( - '47c19e4bc4716edc4077cfa5ea61edf4d02af4ec51a0ecfe035626ae8b561c75' - ) + remote_checksum = "47c19e4bc4716edc4077cfa5ea61edf4d02af4ec51a0ecfe035626ae8b561c75" if remote_checksum != checksum: raise IOError( f"{file_path} has an SHA256 checksum ({checksum}) differing " @@ -169,7 +172,7 @@ def fetch( """ if name not in DATASETS: - raise Exception('Avaliable datasets are ' + str(list(DATASETS))) + raise Exception("Avaliable datasets are " + str(list(DATASETS))) filename = _fetch_remote(data_home=data_home) X, y, train_splits, test_splits = loadmat(filename)[name][0][0] if len(y.shape) == 2 and y.shape[1] == 1: diff --git a/skdatasets/repositories/sklearn.py b/skdatasets/repositories/sklearn.py index 4a2e2c3..39fa22d 100644 --- a/skdatasets/repositories/sklearn.py +++ b/skdatasets/repositories/sklearn.py @@ -5,46 +5,81 @@ @license: MIT """ -from sklearn.datasets import (fetch_20newsgroups, - fetch_20newsgroups_vectorized, - fetch_california_housing, fetch_covtype, - fetch_kddcup99, fetch_lfw_pairs, - fetch_lfw_people, fetch_olivetti_faces, - fetch_rcv1, load_breast_cancer, load_diabetes, - load_digits, load_iris, load_linnerud, load_wine, - make_biclusters, make_blobs, make_checkerboard, - make_circles, make_classification, - make_friedman1, make_friedman2, make_friedman3, - make_gaussian_quantiles, make_hastie_10_2, - make_low_rank_matrix, make_moons, - make_multilabel_classification, make_regression, - make_s_curve, make_sparse_coded_signal, - make_sparse_spd_matrix, make_sparse_uncorrelated, - make_spd_matrix, make_swiss_roll) +from sklearn.datasets import ( + fetch_20newsgroups, + fetch_20newsgroups_vectorized, + fetch_california_housing, + fetch_covtype, + fetch_kddcup99, + fetch_lfw_pairs, + fetch_lfw_people, + fetch_olivetti_faces, + fetch_rcv1, + load_breast_cancer, + load_diabetes, + load_digits, + load_iris, + load_linnerud, + load_wine, + make_biclusters, + make_blobs, + make_checkerboard, + make_circles, + make_classification, + make_friedman1, + make_friedman2, + make_friedman3, + make_gaussian_quantiles, + make_hastie_10_2, + make_low_rank_matrix, + make_moons, + make_multilabel_classification, + make_regression, + make_s_curve, + make_sparse_coded_signal, + make_sparse_spd_matrix, + make_sparse_uncorrelated, + make_spd_matrix, + make_swiss_roll, +) -DATASETS = {'20newsgroups': fetch_20newsgroups, - '20newsgroups_vectorized': fetch_20newsgroups_vectorized, - 'biclusters': make_biclusters, 'blobs': make_blobs, - 'breast_cancer': load_breast_cancer, - 'california_housing': fetch_california_housing, - 'checkerboard': make_checkerboard, 'circles': make_circles, - 'classification': make_classification, 'covtype': fetch_covtype, - 'diabetes': load_diabetes, 'digits': load_digits, - 'friedman1': make_friedman1, 'friedman2': make_friedman2, - 'friedman3': make_friedman3, - 'gaussian_quantiles': make_gaussian_quantiles, - 'hastie_10_2': make_hastie_10_2, 'iris': load_iris, - 'kddcup99': fetch_kddcup99, 'lfw_people': fetch_lfw_people, - 'lfw_pairs': fetch_lfw_pairs, 'linnerud': load_linnerud, - 'low_rank_matrix': make_low_rank_matrix, 'moons': make_moons, - 'multilabel_classification': make_multilabel_classification, - 'olivetti_faces': fetch_olivetti_faces, 'rcv1': fetch_rcv1, - 'regression': make_regression, 's_curve': make_s_curve, - 'sparse_coded_signal': make_sparse_coded_signal, - 'sparse_spd_matrix': make_sparse_spd_matrix, - 'sparse_uncorrelated': make_sparse_uncorrelated, - 'spd_matrix': make_spd_matrix, 'swiss_roll': make_swiss_roll, - 'wine': load_wine} +DATASETS = { + "20newsgroups": fetch_20newsgroups, + "20newsgroups_vectorized": fetch_20newsgroups_vectorized, + "biclusters": make_biclusters, + "blobs": make_blobs, + "breast_cancer": load_breast_cancer, + "california_housing": fetch_california_housing, + "checkerboard": make_checkerboard, + "circles": make_circles, + "classification": make_classification, + "covtype": fetch_covtype, + "diabetes": load_diabetes, + "digits": load_digits, + "friedman1": make_friedman1, + "friedman2": make_friedman2, + "friedman3": make_friedman3, + "gaussian_quantiles": make_gaussian_quantiles, + "hastie_10_2": make_hastie_10_2, + "iris": load_iris, + "kddcup99": fetch_kddcup99, + "lfw_people": fetch_lfw_people, + "lfw_pairs": fetch_lfw_pairs, + "linnerud": load_linnerud, + "low_rank_matrix": make_low_rank_matrix, + "moons": make_moons, + "multilabel_classification": make_multilabel_classification, + "olivetti_faces": fetch_olivetti_faces, + "rcv1": fetch_rcv1, + "regression": make_regression, + "s_curve": make_s_curve, + "sparse_coded_signal": make_sparse_coded_signal, + "sparse_spd_matrix": make_sparse_spd_matrix, + "sparse_uncorrelated": make_sparse_uncorrelated, + "spd_matrix": make_spd_matrix, + "swiss_roll": make_swiss_roll, + "wine": load_wine, +} def fetch(name, *, return_X_y=False, **kwargs): diff --git a/skdatasets/repositories/uci.py b/skdatasets/repositories/uci.py index 25f732e..d701dda 100644 --- a/skdatasets/repositories/uci.py +++ b/skdatasets/repositories/uci.py @@ -6,7 +6,6 @@ """ from __future__ import annotations -import sys from pathlib import Path from typing import Any, Literal, Optional, Tuple, Union, overload @@ -16,21 +15,18 @@ from .base import fetch_file -BASE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases' +BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases" def _load_csv( fname: Path, **kwargs: Any, -) -> Tuple[ - np.typing.NDArray[float], - np.typing.NDArray[Union[float, int, str]], -]: +) -> Tuple[np.typing.NDArray[float], np.typing.NDArray[Union[float, int, str]],]: """Load a csv with targets in the last column and features in the rest.""" data = np.genfromtxt( fname, dtype=str, - delimiter=',', + delimiter=",", encoding=None, **kwargs, ) @@ -57,9 +53,9 @@ def _fetch( np.typing.NDArray[str], ]: """Fetch dataset.""" - subfolder = 'uci' - filename_str = name + '.data' - url = BASE_URL + '/' + name + '/' + filename_str + subfolder = "uci" + filename_str = name + ".data" + url = BASE_URL + "/" + name + "/" + filename_str filename = fetch_file( dataname=name, @@ -74,8 +70,8 @@ def _fetch( y = ordinal_encoder.fit_transform(y.reshape(-1, 1))[:, 0] target_names = ordinal_encoder.categories_[0] try: - filename_str = name + '.test' - url = BASE_URL + '/' + name + '/' + filename_str + filename_str = name + ".test" + url = BASE_URL + "/" + name + "/" + filename_str filename = fetch_file( dataname=name, urlname=url, @@ -93,8 +89,8 @@ def _fetch( X_test = None y_test = None try: - filename_str = name + '.names' - url = BASE_URL + '/' + name + '/' + filename_str + filename_str = name + ".names" + url = BASE_URL + "/" + name + "/" + filename_str filename = fetch_file( dataname=name, urlname=url, @@ -102,8 +98,8 @@ def _fetch( data_home=data_home, ) except Exception: - filename_str = name + '.info' - url = BASE_URL + '/' + name + '/' + filename_str + filename_str = name + ".info" + url = BASE_URL + "/" + name + "/" + filename_str filename = fetch_file( dataname=name, urlname=url, @@ -140,10 +136,7 @@ def fetch( data_home: Optional[str] = None, *, return_X_y: bool = False, -) -> Union[ - Bunch, - Tuple[np.typing.NDArray[float], np.typing.NDArray[float]], -]: +) -> Union[Bunch, Tuple[np.typing.NDArray[float], np.typing.NDArray[float]],]: """ Fetch UCI dataset. diff --git a/skdatasets/repositories/ucr.py b/skdatasets/repositories/ucr.py index 9dbccf8..280c9f4 100644 --- a/skdatasets/repositories/ucr.py +++ b/skdatasets/repositories/ucr.py @@ -6,7 +6,6 @@ """ from __future__ import annotations -import sys from pathlib import Path from typing import Final, Literal, Optional, Sequence, Tuple, Union, overload @@ -16,7 +15,7 @@ from .base import fetch_zip as _fetch_zip -BASE_URL: Final = 'http://www.timeseriesclassification.com/Downloads/' +BASE_URL: Final = "http://www.timeseriesclassification.com/ClassificationDownloads/" def _target_conversion( @@ -37,9 +36,8 @@ def data_to_matrix( ) -> np.typing.NDArray[float]: fields = struct_array.dtype.fields assert fields - if( - len(fields.items()) == 1 - and list(fields.items())[0][1][0] == np.dtype(np.object_) + if len(fields.items()) == 1 and list(fields.items())[0][1][0] == np.dtype( + np.object_ ): attribute = struct_array[list(fields.items())[0][0]] @@ -89,10 +87,7 @@ def fetch( *, data_home: Optional[str] = None, return_X_y: bool = False, -) -> Union[ - Bunch, - Tuple[np.typing.NDArray[float], np.typing.NDArray[int]], -]: +) -> Union[Bunch, Tuple[np.typing.NDArray[float], np.typing.NDArray[int]], ]: """ Fetch UCR dataset. @@ -121,7 +116,7 @@ def fetch( data_path = _fetch_zip( name, - urlname=url + '.zip', + urlname=url + ".zip", subfolder="ucr", data_home=data_home, ) @@ -137,12 +132,12 @@ def fetch( # No description is found path_file_descr = None - path_file_train = (data_path / (name + '_TRAIN')).with_suffix(".arff") - path_file_test = (data_path / (name + '_TEST')).with_suffix(".arff") + path_file_train = (data_path / (name + "_TRAIN")).with_suffix(".arff") + path_file_test = (data_path / (name + "_TEST")).with_suffix(".arff") DESCR = ( - path_file_descr.read_text(errors='surrogateescape') - if path_file_descr else '' + path_file_descr.read_text( + errors="surrogateescape") if path_file_descr else "" ) train = scipy.io.arff.loadarff(path_file_train) test = scipy.io.arff.loadarff(path_file_test) diff --git a/skdatasets/tests/repositories/__init__.py b/skdatasets/tests/repositories/__init__.py index b8f1e3e..14d1303 100644 --- a/skdatasets/tests/repositories/__init__.py +++ b/skdatasets/tests/repositories/__init__.py @@ -14,10 +14,14 @@ def check_estimator(data): """Check that the dataset can be used to cross-validate an estimator.""" - estimator = GridSearchCV(Pipeline([('tr', StandardScaler(with_mean=False)), - ('pred', Ridge(max_iter=4))]), - {'pred__alpha': [0.33, 0.66]}, - cv=data.inner_cv, error_score=np.nan) + estimator = GridSearchCV( + Pipeline( + [("tr", StandardScaler(with_mean=False)), ("pred", Ridge(max_iter=4))] + ), + {"pred__alpha": [0.33, 0.66]}, + cv=data.inner_cv, + error_score=np.nan, + ) if data.train_indices and data.test_indices: train_indices = data.train_indices @@ -28,15 +32,11 @@ def check_estimator(data): data.data[train_indices], y=data.target[train_indices], ) - estimator.score( - data.data[data.test_indices], - y=data.target[data.test_indices] - ) + estimator.score(data.data[data.test_indices], y=data.target[data.test_indices]) else: - if hasattr(data.outer_cv, '__iter__'): + if hasattr(data.outer_cv, "__iter__"): for X, y, X_test, y_test in data.outer_cv: estimator.fit(X, y=y) estimator.score(X_test, y=y_test) else: - cross_validate(estimator, data.data, y=data.target, - cv=data.outer_cv) + cross_validate(estimator, data.data, y=data.target, cv=data.outer_cv) diff --git a/skdatasets/tests/repositories/test_cran.py b/skdatasets/tests/repositories/test_cran.py index 6101299..752c786 100644 --- a/skdatasets/tests/repositories/test_cran.py +++ b/skdatasets/tests/repositories/test_cran.py @@ -10,9 +10,9 @@ def test_cran_geyser(): """Tests CRAN geyser dataset.""" - fetch('geyser') + fetch("geyser") def test_cran_geyser_return_X_y(): """Tests CRAN geyser dataset.""" - X, y = fetch('geyser', return_X_y=True) + X, y = fetch("geyser", return_X_y=True) diff --git a/skdatasets/tests/repositories/test_forex.py b/skdatasets/tests/repositories/test_forex.py index 1651eb2..2ad0db3 100644 --- a/skdatasets/tests/repositories/test_forex.py +++ b/skdatasets/tests/repositories/test_forex.py @@ -12,21 +12,34 @@ def test_forex_usd_eur(): """Tests forex USD-EUR dataset.""" - data = fetch(start=date(2015, 1, 1), end=date(2015, 1, 31), - currency_1='USD', currency_2='EUR') + data = fetch( + start=date(2015, 1, 1), + end=date(2015, 1, 31), + currency_1="USD", + currency_2="EUR", + ) assert data.data.shape == (31, 1) def test_forex_usd_eur_return_X_y(): """Tests forex USD-EUR dataset.""" - X, y = fetch(start=date(2015, 1, 1), end=date(2015, 1, 31), - currency_1='USD', currency_2='EUR', return_X_y=True) + X, y = fetch( + start=date(2015, 1, 1), + end=date(2015, 1, 31), + currency_1="USD", + currency_2="EUR", + return_X_y=True, + ) assert X.shape == (31, 1) assert y is None def test_forex_btc_eur(): """Tests forex BTC-EUR dataset.""" - data = fetch(start=date(2015, 1, 1), end=date(2015, 1, 31), - currency_1='BTC', currency_2='EUR') + data = fetch( + start=date(2015, 1, 1), + end=date(2015, 1, 31), + currency_1="BTC", + currency_2="EUR", + ) assert data.data.shape == (31, 1) diff --git a/skdatasets/tests/repositories/test_keel.py b/skdatasets/tests/repositories/test_keel.py index ba0434e..b21e029 100644 --- a/skdatasets/tests/repositories/test_keel.py +++ b/skdatasets/tests/repositories/test_keel.py @@ -27,36 +27,36 @@ def check(data, shape, splits=1): def test_fetch_keel_abalone9_18(): """Tests Keel abalone9-18 dataset.""" - data = fetch('imbalanced', 'abalone9-18') + data = fetch("imbalanced", "abalone9-18") check(data, (731, 10)) def test_fetch_keel_abalone9_18_return_X_y(): """Tests Keel abalone9-18 dataset.""" - X, y = fetch('imbalanced', 'abalone9-18', return_X_y=True) + X, y = fetch("imbalanced", "abalone9-18", return_X_y=True) assert X.shape == (731, 10) assert y.shape == (731,) def test_fetch_keel_abalone9_18_folds(): """Tests Keel abalone9-18 dataset with folds.""" - data = fetch('imbalanced', 'abalone9-18', nfolds=5) + data = fetch("imbalanced", "abalone9-18", nfolds=5) check(data, (731, 10), splits=5) def test_fetch_keel_banana(): """Tests Keel banana dataset.""" - data = fetch('classification', 'banana') + data = fetch("classification", "banana") check(data, (5300, 2)) def test_fetch_keel_banana_folds(): """Tests Keel banana dataset with folds.""" - data = fetch('classification', 'banana', nfolds=5) + data = fetch("classification", "banana", nfolds=5) check(data, (5300, 2), splits=5) def test_fetch_keel_banana_dobscv(): """Tests Keel banana dataset with dobscv folds.""" - data = fetch('classification', 'banana', nfolds=5, dobscv=True) + data = fetch("classification", "banana", nfolds=5, dobscv=True) check(data, (5300, 2), splits=5) diff --git a/skdatasets/tests/repositories/test_keras.py b/skdatasets/tests/repositories/test_keras.py index 4dc47b9..88b7d06 100644 --- a/skdatasets/tests/repositories/test_keras.py +++ b/skdatasets/tests/repositories/test_keras.py @@ -7,8 +7,6 @@ from skdatasets.repositories.keras import fetch -from . import check_estimator - def check(data, n_samples_train, n_samples_test, n_features): """Check dataset properties.""" @@ -21,12 +19,12 @@ def check(data, n_samples_train, n_samples_test, n_features): def test_keras_mnist(): """Tests keras MNIST dataset.""" - data = fetch('mnist') + data = fetch("mnist") check(data, n_samples_train=60000, n_samples_test=10000, n_features=28 * 28) def test_keras_mnist_return_X_y(): """Tests keras MNIST dataset.""" - X, y = fetch('mnist', return_X_y=True) + X, y = fetch("mnist", return_X_y=True) assert X.shape == (70000, 28 * 28) assert y.shape == (70000,) diff --git a/skdatasets/tests/repositories/test_libsvm.py b/skdatasets/tests/repositories/test_libsvm.py index f0f7e67..dead485 100644 --- a/skdatasets/tests/repositories/test_libsvm.py +++ b/skdatasets/tests/repositories/test_libsvm.py @@ -22,12 +22,16 @@ def check( ): """Check dataset properties.""" if n_samples is None: - n_samples = sum(n for n in [ - n_samples_train, - n_samples_validation, - n_samples_test, - n_samples_remaining - ] if n is not None) + n_samples = sum( + n + for n in [ + n_samples_train, + n_samples_validation, + n_samples_test, + n_samples_remaining, + ] + if n is not None + ) assert data.data.shape == (n_samples, n_features) assert data.target.shape[0] == n_samples @@ -60,39 +64,54 @@ def check( def test_fetch_libsvm_australian(): """Tests LIBSVM australian dataset.""" - data = fetch('binary', 'australian') + data = fetch("binary", "australian") check(data, n_samples=690, n_features=14) def test_fetch_libsvm_australian_return_X_y(): """Tests LIBSVM australian dataset.""" - X, y = fetch('binary', 'australian', return_X_y=True) + X, y = fetch("binary", "australian", return_X_y=True) assert X.shape == (690, 14) assert y.shape == (690,) def test_fetch_libsvm_liver_disorders(): """Tests LIBSVM liver-disorders dataset.""" - data = fetch('binary', 'liver-disorders') + data = fetch("binary", "liver-disorders") check(data, n_samples_train=145, n_samples_test=200, n_features=5) def test_fetch_libsvm_duke(): """Tests LIBSVM duke dataset.""" - data = fetch('binary', 'duke') - check(data, n_samples_train=38, n_samples_validation=4, - n_features=7129, estimator=False) + data = fetch("binary", "duke") + check( + data, + n_samples_train=38, + n_samples_validation=4, + n_features=7129, + estimator=False, + ) def test_fetch_libsvm_cod_rna(): """Tests LIBSVM cod-rna dataset.""" - data = fetch('binary', 'cod-rna') - check(data, n_samples_train=59535, n_samples_test=271617, - n_samples_remaining=157413, n_features=8) + data = fetch("binary", "cod-rna") + check( + data, + n_samples_train=59535, + n_samples_test=271617, + n_samples_remaining=157413, + n_features=8, + ) def test_fetch_libsvm_satimage(): """Tests LIBSVM satimage dataset.""" - data = fetch('multiclass', 'satimage.scale') - check(data, n_samples_train=3104, n_samples_test=2000, - n_samples_validation=1331, n_features=36) + data = fetch("multiclass", "satimage.scale") + check( + data, + n_samples_train=3104, + n_samples_test=2000, + n_samples_validation=1331, + n_features=36, + ) diff --git a/skdatasets/tests/repositories/test_physionet.py b/skdatasets/tests/repositories/test_physionet.py index d481a1f..c1c67d5 100644 --- a/skdatasets/tests/repositories/test_physionet.py +++ b/skdatasets/tests/repositories/test_physionet.py @@ -6,7 +6,7 @@ def test_fetch_ctu_uhb_ctgdb() -> None: """Tests ctu_uhb dataset.""" X, y = fetch( - 'ctu-uhb-ctgdb', + "ctu-uhb-ctgdb", return_X_y=True, target_column=["pH", "BDecf", "pCO2", "BE", "Apgar1", "Apgar5"], ) @@ -17,7 +17,7 @@ def test_fetch_ctu_uhb_ctgdb() -> None: def test_fetch_ctu_uhb_ctgdb_single_target() -> None: """Tests ctu_uhb dataset with one target.""" X, y = fetch( - 'ctu-uhb-ctgdb', + "ctu-uhb-ctgdb", return_X_y=True, target_column="pH", ) @@ -28,7 +28,7 @@ def test_fetch_ctu_uhb_ctgdb_single_target() -> None: def test_fetch_ctu_uhb_ctgdb_bunch() -> None: """Tests ctu_uhb dataset returning Bunch.""" bunch = fetch( - 'ctu-uhb-ctgdb', + "ctu-uhb-ctgdb", as_frame=True, target_column=["pH", "BDecf", "pCO2", "BE", "Apgar1", "Apgar5"], ) @@ -40,7 +40,7 @@ def test_fetch_ctu_uhb_ctgdb_bunch() -> None: def test_fetch_macecgdb() -> None: """Tests macecgdb dataset.""" bunch = fetch( - 'macecgdb', + "macecgdb", as_frame=True, ) assert bunch.data.shape == (27, 5) diff --git a/skdatasets/tests/repositories/test_raetsch.py b/skdatasets/tests/repositories/test_raetsch.py index 0248594..9964ba9 100644 --- a/skdatasets/tests/repositories/test_raetsch.py +++ b/skdatasets/tests/repositories/test_raetsch.py @@ -20,12 +20,12 @@ def check(data, shape, splits=100): def test_fetch_raetsch_banana(): """Tests Gunnar Raetsch banana dataset.""" - data = fetch('banana') + data = fetch("banana") check(data, (5300, 2), splits=100) def test_fetch_raetsch_banana_return_X_y(): """Tests Gunnar Raetsch banana dataset.""" - X, y = fetch('banana', return_X_y=True) + X, y = fetch("banana", return_X_y=True) assert X.shape == (5300, 2) assert y.shape == (5300,) diff --git a/skdatasets/tests/repositories/test_sklearn.py b/skdatasets/tests/repositories/test_sklearn.py index 7b96bf1..c84e142 100644 --- a/skdatasets/tests/repositories/test_sklearn.py +++ b/skdatasets/tests/repositories/test_sklearn.py @@ -12,13 +12,13 @@ def test_sklearn_iris(): """Tests Scikit-learn iris dataset.""" - data = fetch('iris') + data = fetch("iris") assert data.data.shape == (150, 4) check_estimator(data) def test_sklearn_iris_return_X_y(): """Tests Scikit-learn iris dataset.""" - X, y = fetch('iris', return_X_y=True) + X, y = fetch("iris", return_X_y=True) assert X.shape == (150, 4) assert y.shape == (150,) diff --git a/skdatasets/tests/repositories/test_uci.py b/skdatasets/tests/repositories/test_uci.py index 361d496..b8ec8e3 100644 --- a/skdatasets/tests/repositories/test_uci.py +++ b/skdatasets/tests/repositories/test_uci.py @@ -10,7 +10,7 @@ def test_fetch_uci_wine(): """Tests UCI wine dataset.""" - data = fetch('wine') + data = fetch("wine") assert data.data.shape == (178, 13) assert data.target.shape[0] == data.data.shape[0] assert not data.train_indices @@ -22,6 +22,6 @@ def test_fetch_uci_wine(): def test_fetch_uci_wine_return_X_y(): """Tests UCI wine dataset.""" - X, y = fetch('wine', return_X_y=True) + X, y = fetch("wine", return_X_y=True) assert X.shape == (178, 13) assert y.shape == (178,) diff --git a/skdatasets/tests/repositories/test_ucr.py b/skdatasets/tests/repositories/test_ucr.py index f08181b..f8ca722 100644 --- a/skdatasets/tests/repositories/test_ucr.py +++ b/skdatasets/tests/repositories/test_ucr.py @@ -10,7 +10,7 @@ def test_fetch_ucr_gunpoint(): """Tests UCR GunPoint dataset.""" - data = fetch('GunPoint') + data = fetch("GunPoint") assert data.data.shape == (200, 150) assert len(data.train_indices) == 50 assert len(data.test_indices) == 150 @@ -18,14 +18,14 @@ def test_fetch_ucr_gunpoint(): def test_fetch_ucr_gunpoint_return_X_y(): """Tests UCR GunPoint dataset.""" - X, y = fetch('GunPoint', return_X_y=True) + X, y = fetch("GunPoint", return_X_y=True) assert X.shape == (200, 150) assert y.shape == (200,) def test_fetch_ucr_basicmotions(): """Tests UCR GunPoint dataset.""" - data = fetch('BasicMotions') + data = fetch("BasicMotions") assert data.data.shape == (80,) assert len(data.train_indices) == 40 assert len(data.test_indices) == 40 diff --git a/skdatasets/tests/utils/run.py b/skdatasets/tests/utils/run.py index 45d229a..3e0e93d 100755 --- a/skdatasets/tests/utils/run.py +++ b/skdatasets/tests/utils/run.py @@ -13,21 +13,27 @@ from skdatasets.utils.experiment import experiment -def main(dataset=fetch, estimator=json2estimator, - observers=[FileStorageObserver('.results')]): - parser = argparse.ArgumentParser(description='Run an experiment.') - parser.add_argument('-r', '--repository', type=str, help='repository') - parser.add_argument('-c', '--collection', type=str, default=None, - help='collection') - parser.add_argument('-d', '--dataset', type=str, help='dataset') - parser.add_argument('-e', '--estimator', type=str, help='estimator') +def main( + dataset=fetch, estimator=json2estimator, observers=[FileStorageObserver(".results")] +): + parser = argparse.ArgumentParser(description="Run an experiment.") + parser.add_argument("-r", "--repository", type=str, help="repository") + parser.add_argument("-c", "--collection", type=str, default=None, help="collection") + parser.add_argument("-d", "--dataset", type=str, help="dataset") + parser.add_argument("-e", "--estimator", type=str, help="estimator") args = parser.parse_args() e = experiment(dataset, estimator) e.observers.extend(observers) - e.run(config_updates={'dataset': {'repository': args.repository, - 'collection': args.collection, - 'dataset': args.dataset}, - 'estimator': {'estimator': args.estimator}}) + e.run( + config_updates={ + "dataset": { + "repository": args.repository, + "collection": args.collection, + "dataset": args.dataset, + }, + "estimator": {"estimator": args.estimator}, + } + ) if __name__ == "__main__": diff --git a/skdatasets/tests/utils/test_estimator.py b/skdatasets/tests/utils/test_estimator.py index 5a1c60f..3c00640 100644 --- a/skdatasets/tests/utils/test_estimator.py +++ b/skdatasets/tests/utils/test_estimator.py @@ -11,12 +11,14 @@ def test_json2estimator(): """Tests instantiation of estimator from a json file.""" import sklearn - e = json2estimator('skdatasets/tests/utils/LinearRegression.json') + + e = json2estimator("skdatasets/tests/utils/LinearRegression.json") assert type(e) == GridSearchCV def test_json2estimator_custom(): """Tests instantiation of a custom estimator from a json file.""" import skdatasets - e = json2estimator('skdatasets/tests/utils/LinearRegressionCustom.json') + + e = json2estimator("skdatasets/tests/utils/LinearRegressionCustom.json") assert type(e) == GridSearchCV diff --git a/skdatasets/tests/utils/test_experiment.py b/skdatasets/tests/utils/test_experiment.py index 49c97b4..fdc529f 100644 --- a/skdatasets/tests/utils/test_experiment.py +++ b/skdatasets/tests/utils/test_experiment.py @@ -4,10 +4,12 @@ """ from __future__ import annotations +import pytest import tempfile from typing import TYPE_CHECKING, Iterable, Tuple, Union import numpy as np +import pytest from sacred.observers import FileStorageObserver from sklearn.datasets import load_diabetes, load_iris, load_wine from sklearn.model_selection import GridSearchCV, train_test_split @@ -16,6 +18,7 @@ from sklearn.utils import Bunch from skdatasets.utils.experiment import ( + ScorerLike, create_experiments, experiment, fetch_scores, @@ -56,7 +59,7 @@ def _dataset( def _estimator(cv: CVLike) -> GridSearchCV: return GridSearchCV( DecisionTreeRegressor(), - {'max_depth': [2, 4]}, + {"max_depth": [2, 4]}, cv=cv, ) @@ -70,24 +73,27 @@ def _experiment( e.observers.append(FileStorageObserver(tmpdirname)) e.run( config_updates={ - 'dataset': { - 'inner_cv': inner_cv, - 'outer_cv': outer_cv, + "dataset": { + "inner_cv": inner_cv, + "outer_cv": outer_cv, }, }, ) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_nested_cv() -> None: """Tests nested CV experiment.""" _experiment(3, 3) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_inner_cv() -> None: """Tests inner CV experiment.""" _experiment(3, None) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_explicit_inner_folds() -> None: """Tests explicit inner folds experiment.""" X, y = load_diabetes(return_X_y=True) @@ -101,6 +107,7 @@ def test_explicit_inner_folds() -> None: ) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_explicit_outer_folds_indexes() -> None: """Tests explicit outer folds experiment.""" X, y = load_diabetes(return_X_y=True) @@ -114,6 +121,7 @@ def test_explicit_outer_folds_indexes() -> None: ) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_explicit_outer_folds() -> None: """Tests explicit outer folds experiment.""" X, y = load_diabetes(return_X_y=True) @@ -127,6 +135,7 @@ def test_explicit_outer_folds() -> None: ) +@pytest.mark.skip(reason="Waiting for Sacred to be fixed.") def test_explicit_nested_folds() -> None: """Tests explicit nested folds experiment.""" X, y = load_diabetes(return_X_y=True) @@ -144,7 +153,38 @@ def test_explicit_nested_folds() -> None: ) -def test_create_experiments_basic() -> None: +@pytest.mark.parametrize( + ["scoring", "expected_mean", "expected_std"], + [ + ( + None, + [ + [0.96666667, 0.97333333, 0.98], + [0.70285714, 0.69126984, 0.68063492], + ], + [ + [0.02108185, 0.02494438, 0.01632993], + [0.07920396, 0.04877951, 0.0662983], + ], + ), + ( + "recall_micro", + [ + [0.96666667, 0.97333333, 0.98], + [0.70285714, 0.69126984, 0.68063492], + ], + [ + [0.02108185, 0.02494438, 0.01632993], + [0.07920396, 0.04877951, 0.0662983], + ], + ), + ], +) +def test_create_experiments_basic( + scoring: ScorerLike[np.typing.NDArray[np.float_], np.typing.NDArray[np.int_]], + expected_mean: np.typing.NDArray[np.float_], + expected_std: np.typing.NDArray[np.float_], +) -> None: with tempfile.TemporaryDirectory() as tmpdirname: experiments = create_experiments( @@ -157,6 +197,7 @@ def test_create_experiments_basic() -> None: "iris": load_iris(), "wine": load_wine(), }, + scoring=scoring, storage=tmpdirname, ) @@ -171,16 +212,10 @@ def test_create_experiments_basic() -> None: assert scores.estimator_names == ("knn-3", "knn-5", "knn-7") np.testing.assert_allclose( scores.scores_mean, - [ - [0.96666667, 0.97333333, 0.98], - [0.70285714, 0.69126984, 0.68063492], - ], + expected_mean, ) np.testing.assert_allclose( scores.scores_std, - [ - [0.02108185, 0.02494438, 0.01632993], - [0.07920396, 0.04877951, 0.0662983], - ], + expected_std, rtol=1e-6, ) diff --git a/skdatasets/tests/utils/test_run.py b/skdatasets/tests/utils/test_run.py index 5781bb3..87fa39a 100644 --- a/skdatasets/tests/utils/test_run.py +++ b/skdatasets/tests/utils/test_run.py @@ -8,40 +8,117 @@ def test_binary_classification(): """Tests binary classification experiment.""" - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'keel', '-c', - 'imbalanced', '-d', 'abalone9-18', '-e', - 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "keel", + "-c", + "imbalanced", + "-d", + "abalone9-18", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'libsvm', '-c', 'binary', - '-d', 'breast-cancer', '-e', - 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "libsvm", + "-c", + "binary", + "-d", + "breast-cancer", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'raetsch', '-d', - 'banana', '-e', 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "raetsch", + "-d", + "banana", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 def test_multiclass_classification(): """Tests multiclass classification experiment.""" - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'sklearn', '-d', 'iris', - '-e', 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "sklearn", + "-d", + "iris", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'uci', '-d', 'wine', - '-e', 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "uci", + "-d", + "wine", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'libsvm', '-c', - 'multiclass', '-d', 'shuttle', '-e', - 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "libsvm", + "-c", + "multiclass", + "-d", + "shuttle", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'libsvm', '-c', - 'multiclass', '-d', 'usps', '-e', - 'skdatasets/tests/utils/MLPClassifier.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "libsvm", + "-c", + "multiclass", + "-d", + "usps", + "-e", + "skdatasets/tests/utils/MLPClassifier.json", + ] + ) assert ret >= 0 def test_regression(): """Tests regression experiment.""" - ret = subprocess.call(['skdatasets/tests/utils/run.py', '-r', 'libsvm', '-c', - 'regression', '-d', 'housing', '-e', - 'skdatasets/tests/utils/MLPRegressor.json']) + ret = subprocess.call( + [ + "skdatasets/tests/utils/run.py", + "-r", + "libsvm", + "-c", + "regression", + "-d", + "housing", + "-e", + "skdatasets/tests/utils/MLPRegressor.json", + ] + ) assert ret >= 0 diff --git a/skdatasets/tests/utils/test_scores.py b/skdatasets/tests/utils/test_scores.py index b3687c7..687fdda 100644 --- a/skdatasets/tests/utils/test_scores.py +++ b/skdatasets/tests/utils/test_scores.py @@ -7,23 +7,45 @@ from skdatasets.utils.scores import hypotheses_table, scores_table -datasets = ['a4a', 'a8a', 'combined', 'dna', 'ijcnn1', 'letter', 'pendigits', - 'satimage', 'shuttle', 'usps', 'w7a', 'w8a'] -estimators = ['LogisticRegression', 'MLPClassifier0', 'MLPClassifier1', - 'MLPClassifier2', 'MLPClassifier3', 'MLPClassifier4', - 'MLPClassifier5'] -scores = np.asarray(((89.79, 89.78, 89.76, 89.88, 89.85, 89.91, 89.93), - (90.73, 90.73, 90.73, 90.85, 90.83, 90.81, 90.80), - (92.36, 92.31, 94.58, 94.82, 94.84, 94.92, 94.89), - (99.28, 99.27, 99.28, 99.26, 99.27, 99.25, 99.25), - (91.34, 91.34, 99.29, 99.33, 99.34, 99.53, 99.54), - (98.07, 98.04, 99.94, 99.95, 99.96, 99.96, 99.95), - (99.17, 99.08, 99.87, 99.87, 99.88, 99.90, 99.89), - (96.67, 96.28, 98.84, 98.87, 98.90, 98.87, 98.92), - (95.85, 92.83, 99.88, 99.93, 99.96, 99.98, 99.99), - (99.12, 99.11, 99.65, 99.58, 99.58, 99.65, 99.60), - (95.93, 95.40, 94.58, 96.31, 96.34, 96.58, 96.50), - (95.80, 95.99, 95.35, 96.20, 96.22, 96.36, 96.71))) +datasets = [ + "a4a", + "a8a", + "combined", + "dna", + "ijcnn1", + "letter", + "pendigits", + "satimage", + "shuttle", + "usps", + "w7a", + "w8a", +] +estimators = [ + "LogisticRegression", + "MLPClassifier0", + "MLPClassifier1", + "MLPClassifier2", + "MLPClassifier3", + "MLPClassifier4", + "MLPClassifier5", +] +scores = np.asarray( + ( + (89.79, 89.78, 89.76, 89.88, 89.85, 89.91, 89.93), + (90.73, 90.73, 90.73, 90.85, 90.83, 90.81, 90.80), + (92.36, 92.31, 94.58, 94.82, 94.84, 94.92, 94.89), + (99.28, 99.27, 99.28, 99.26, 99.27, 99.25, 99.25), + (91.34, 91.34, 99.29, 99.33, 99.34, 99.53, 99.54), + (98.07, 98.04, 99.94, 99.95, 99.96, 99.96, 99.95), + (99.17, 99.08, 99.87, 99.87, 99.88, 99.90, 99.89), + (96.67, 96.28, 98.84, 98.87, 98.90, 98.87, 98.92), + (95.85, 92.83, 99.88, 99.93, 99.96, 99.98, 99.99), + (99.12, 99.11, 99.65, 99.58, 99.58, 99.65, 99.60), + (95.93, 95.40, 94.58, 96.31, 96.34, 96.58, 96.50), + (95.80, 95.99, 95.35, 96.20, 96.22, 96.36, 96.71), + ) +) def test_scores_table() -> None: @@ -39,12 +61,25 @@ def test_scores_table() -> None: def test_hypotheses_table() -> None: """Tests hypotheses table.""" - for multitest in ('kruskal', 'friedmanchisquare', None): - for test in ('mannwhitneyu', 'wilcoxon'): - hypotheses_table(scores, estimators, - multitest=multitest, test=test) - for correction in ('bonferroni', 'sidak', 'holm-sidak', 'holm', - 'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by', - 'fdr_tsbh', 'fdr_tsbky'): - hypotheses_table(scores, estimators, multitest=multitest, - test=test, correction=correction) + for multitest in ("kruskal", "friedmanchisquare", None): + for test in ("mannwhitneyu", "wilcoxon"): + hypotheses_table(scores, estimators, multitest=multitest, test=test) + for correction in ( + "bonferroni", + "sidak", + "holm-sidak", + "holm", + "simes-hochberg", + "hommel", + "fdr_bh", + "fdr_by", + "fdr_tsbh", + "fdr_tsbky", + ): + hypotheses_table( + scores, + estimators, + multitest=multitest, + test=test, + correction=correction, + ) diff --git a/skdatasets/utils/estimator.py b/skdatasets/utils/estimator.py index 0a07491..33f9952 100644 --- a/skdatasets/utils/estimator.py +++ b/skdatasets/utils/estimator.py @@ -25,7 +25,7 @@ def json2estimator(estimator, **kwargs): Instantiated Scikit-learn estimator. """ - with open(estimator, 'r') as definition: + with open(estimator, "r") as definition: estimator = jsonpickle.decode(definition.read()) for k, v in kwargs.items(): setattr(estimator, k, v) diff --git a/skdatasets/utils/experiment.py b/skdatasets/utils/experiment.py index 2d8a700..932dccf 100755 --- a/skdatasets/utils/experiment.py +++ b/skdatasets/utils/experiment.py @@ -5,13 +5,9 @@ from __future__ import annotations import itertools -import os -import sys from contextlib import contextmanager from dataclasses import dataclass -from inspect import signature -from tempfile import NamedTemporaryFile, mkdtemp -from time import perf_counter, process_time, sleep +from time import perf_counter, sleep from typing import ( Any, Callable, @@ -29,13 +25,13 @@ ) from warnings import warn -import joblib import numpy as np from sacred import Experiment, Ingredient from sacred.observers import FileStorageObserver, MongoObserver, RunObserver from sklearn.base import BaseEstimator, is_classifier +from sklearn.metrics import check_scoring from sklearn.model_selection import check_cv -from sklearn.utils import Bunch, is_scalar_nan +from sklearn.utils import Bunch from incense import ExperimentLoader, FileSystemExperimentLoader from incense.experiment import FileSystemExperiment @@ -44,7 +40,6 @@ class DataLike(Protocol): - def __getitem__( self: SelfType, key: np.typing.NDArray[int], @@ -69,10 +64,14 @@ def __len__(self) -> int: Mapping[str, Any], str, ] +ScorerLike = Union[ + str, + Callable[[BaseEstimator, DataType, TargetType], float], + None, +] class EstimatorProtocol(Protocol[DataType, TargetType]): - def fit(self: SelfType, X: DataType, y: TargetType) -> SelfType: pass @@ -81,7 +80,6 @@ def predict(self, X: DataType) -> TargetType: class CVSplitter(Protocol): - def split( self, X: np.typing.NDArray[float], @@ -172,9 +170,7 @@ def _add_timing(experiment: Experiment, name: str) -> Iterator[None]: def _iterate_outer_cv( - outer_cv: CVLike | Iterable[ - Tuple[DataType, TargetType, DataType, TargetType] - ], + outer_cv: CVLike | Iterable[Tuple[DataType, TargetType, DataType, TargetType]], estimator: EstimatorProtocol[DataType, TargetType], X: DataType, y: TargetType, @@ -187,8 +183,7 @@ def _iterate_outer_cv( cv = check_cv(outer_cv, y, classifier=is_classifier(estimator)) yield from ( - (X[train], y[train], X[test], y[test]) - for train, test in cv.split(X, y) + (X[train], y[train], X[test], y[test]) for train, test in cv.split(X, y) ) @@ -200,9 +195,13 @@ def _benchmark_from_data( y_train: TargetType, X_test: DataType, y_test: TargetType, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, ) -> None: + + scoring_fun = check_scoring(estimator, scoring) + with _add_timing(experiment, "fit_time"): estimator.fit(X_train, y_train) @@ -218,12 +217,12 @@ def _benchmark_from_data( _append_info(experiment, "search_best_score", best_score) with _add_timing(experiment, "score_time"): - test_score = estimator.score(X_test, y_test) + test_score = scoring_fun(estimator, X_test, y_test) _append_info(experiment, "test_score", float(test_score)) if save_train: - train_score = estimator.score(X_train, y_train) + train_score = scoring_fun(estimator, X_train, y_train) _append_info(experiment, "train_score", float(train_score)) for output in ("transform", "predict"): @@ -235,12 +234,8 @@ def _benchmark_from_data( def _compute_means(experiment: Experiment) -> None: - experiment.info["score_mean"] = float( - np.nanmean(experiment.info["test_score"]) - ) - experiment.info["score_std"] = float( - np.nanstd(experiment.info["test_score"]) - ) + experiment.info["score_mean"] = float(np.nanmean(experiment.info["test_score"])) + experiment.info["score_std"] = float(np.nanstd(experiment.info["test_score"])) def _benchmark_one( @@ -248,6 +243,7 @@ def _benchmark_one( *, estimator: BaseEstimator, data: Bunch, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, ) -> None: @@ -259,16 +255,8 @@ def _benchmark_one( validation_indices = getattr(data, "validation_indices", []) test_indices = getattr(data, "test_indices", []) - X_train_val = ( - X[train_indices + validation_indices] - if train_indices - else X - ) - y_train_val = ( - y[train_indices + validation_indices] - if train_indices - else y - ) + X_train_val = X[train_indices + validation_indices] if train_indices else X + y_train_val = y[train_indices + validation_indices] if train_indices else y X_test = X[test_indices] y_test = y[test_indices] @@ -280,6 +268,7 @@ def _benchmark_one( y_train=y_train_val, X_test=X_test, y_test=y_test, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, ) @@ -292,6 +281,7 @@ def _benchmark_partitions( *, estimator: BaseEstimator, data: Bunch, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, outer_cv: CVLike | Literal["dataset"] = None, @@ -313,6 +303,7 @@ def _benchmark_partitions( y_train=y_train, X_test=X_test, y_test=y_test, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, ) @@ -325,6 +316,7 @@ def _benchmark( *, estimator: BaseEstimator, data: Bunch, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, outer_cv: CVLike | Literal[False, "dataset"] = None, @@ -335,6 +327,7 @@ def _benchmark( experiment=experiment, estimator=estimator, data=data, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, ) @@ -343,6 +336,7 @@ def _benchmark( experiment=experiment, estimator=estimator, data=data, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, outer_cv=outer_cv, @@ -353,6 +347,7 @@ def experiment( dataset: Callable[..., Bunch], estimator: Callable[..., BaseEstimator], *, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, ) -> Experiment: @@ -409,6 +404,7 @@ def run() -> None: experiment=experiment, estimator=e, data=data, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, ) @@ -426,8 +422,10 @@ def _get_estimator_function( ) -> Callable[..., EstimatorProtocol[Any, Any]]: if hasattr(estimator, "fit"): + def estimator_function() -> EstimatorProtocol: return estimator + else: estimator_function = estimator @@ -442,6 +440,7 @@ def _get_dataset_function( if callable(dataset): dataset_function = dataset else: + def dataset_function() -> Bunch: return dataset @@ -458,6 +457,7 @@ def _create_one_experiment( config: ConfigLike, inner_cv: CVLike | Literal[False, "dataset"] = None, outer_cv: CVLike | Literal[False, "dataset"] = None, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, ) -> Experiment: @@ -497,6 +497,7 @@ def run() -> None: experiment=experiment, estimator=estimator, data=dataset, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, outer_cv=outer_cv, @@ -513,6 +514,7 @@ def create_experiments( config: ConfigLike | None = None, inner_cv: CVLike | Literal[False, "dataset"] = False, outer_cv: CVLike | Literal[False, "dataset"] = None, + scoring: ScorerLike[DataType, TargetType] = None, save_estimator: bool = False, save_train: bool = False, ) -> Sequence[Experiment]: @@ -574,6 +576,10 @@ def create_experiments( * Otherwise, this will be passed to :external:func:`sklearn.model_selection.check_cv` and the resulting cross validator will be used to define the partitions. + scoring : string, callable or ``None``, default ``None`` + Scoring method used to measure the performance of the estimator. + If a callable, it should have the signature `scorer(estimator, X, y)`. + If ``None`` it uses the ``scorer`` method of the estimator. save_estimator : bool, default ``False`` Whether to save the fitted estimator. This is useful for debugging and for obtaining extra information in some cases, but for some @@ -608,6 +614,7 @@ def create_experiments( config=config, inner_cv=inner_cv, outer_cv=outer_cv, + scoring=scoring, save_estimator=save_estimator, save_train=save_train, ) @@ -677,7 +684,8 @@ def _get_experiments( if ( (ids, dataset_names, estimator_names) == (None, None, None) - or isinstance(loader, FileSystemExperimentLoader) and ids is None + or isinstance(loader, FileSystemExperimentLoader) + and ids is None ): find_all_fun = getattr( loader, @@ -690,16 +698,14 @@ def _get_experiments( experiments = find_all_fun() - elif ( - (dataset_names, estimator_names) == (None, None) - or isinstance(loader, FileSystemExperimentLoader) + elif (dataset_names, estimator_names) == (None, None) or isinstance( + loader, FileSystemExperimentLoader ): load_ids_fun = getattr( loader, "find_by_ids", lambda id_seq: [ - loader.find_by_id(experiment_id) - for experiment_id in id_seq + loader.find_by_id(experiment_id) for experiment_id in id_seq ], ) @@ -718,8 +724,7 @@ def _get_experiments( conditions.append({"_id": {"$in": ids}}) if estimator_names is not None: - conditions.append( - {"config.estimator_name": {"$in": estimator_names}}) + conditions.append({"config.estimator_name": {"$in": estimator_names}}) if dataset_names is not None: conditions.append({"config.dataset_name": {"$in": dataset_names}}) @@ -731,16 +736,14 @@ def _get_experiments( if isinstance(loader, FileSystemExperimentLoader): # Filter experiments by dataset and estimator names experiments = [ - e for e in experiments + e + for e in experiments if ( ( estimator_names is None or e.config["estimator_name"] in estimator_names ) - and ( - dataset_names is None - or e.config["dataset_name"] in dataset_names - ) + and (dataset_names is None or e.config["dataset_name"] in dataset_names) ) ] @@ -842,9 +845,7 @@ def fetch_scores( estimator_names = ( tuple(estimator_list) if estimator_names is None else estimator_names ) - dataset_names = ( - tuple(dataset_list) if dataset_names is None else dataset_names - ) + dataset_names = tuple(dataset_list) if dataset_names is None else dataset_names matrix_shape = (len(dataset_names), len(estimator_names)) scores = np.full(matrix_shape + (nobs,), np.nan) diff --git a/skdatasets/utils/scores.py b/skdatasets/utils/scores.py index f2e21c6..c48fcb2 100644 --- a/skdatasets/utils/scores.py +++ b/skdatasets/utils/scores.py @@ -7,17 +7,7 @@ import itertools as it from dataclasses import dataclass from functools import reduce -from typing import ( - Any, - Callable, - Iterable, - Literal, - Mapping, - Optional, - Sequence, - Tuple, - overload, -) +from typing import Any, Callable, Literal, Mapping, Optional, Sequence, Tuple import numpy as np import pandas as pd @@ -33,21 +23,21 @@ CorrectionLike = Literal[ None, - 'bonferroni', - 'sidak', - 'holm-sidak', - 'holm', - 'simes-hochberg', - 'hommel', - 'fdr_bh', - 'fdr_by', - 'fdr_tsbh', - 'fdr_tsbky', + "bonferroni", + "sidak", + "holm-sidak", + "holm", + "simes-hochberg", + "hommel", + "fdr_bh", + "fdr_by", + "fdr_tsbh", + "fdr_tsbky", ] -MultitestLike = Literal['kruskal', 'friedmanchisquare'] +MultitestLike = Literal["kruskal", "friedmanchisquare"] -TestLike = Literal['mannwhitneyu', 'wilcoxon'] +TestLike = Literal["mannwhitneyu", "wilcoxon"] @dataclass @@ -234,8 +224,8 @@ def _set_style_formatter( styler: pd.io.formats.style.Styler, *, precision: int, + show_rank: bool = True, ) -> pd.io.formats.style.Styler: - def _formatter( data: object, ) -> str: @@ -246,10 +236,12 @@ def _formatter( elif isinstance(data, float): return f"{data:.{precision}f}" elif isinstance(data, ScoreCell): - str_repr = f'{data.mean:.{precision}f}' + str_repr = f"{data.mean:.{precision}f}" if data.std is not None: - str_repr += f' ± {data.std:.{precision}f}' - str_repr += f' ({data.rank:.0f})' + str_repr += f" ± {data.std:.{precision}f}" + if show_rank: + precision_rank = 0 if isinstance(data.rank, int) else precision + str_repr += f" ({data.rank:.{precision_rank}f})" return str_repr else: return "" @@ -285,7 +277,7 @@ def _set_default_style_html( { "selector": ".significant::after", "props": [ - ("content", "\"*\""), + ("content", '"*"'), ("width", "0px"), ("display", "inline-block"), ], @@ -345,25 +337,25 @@ def _set_default_style_latex( styler.set_table_styles( [ { - 'selector': r'newcommand{\summary}', - 'props': r':[1]{\textit{#1}};', + "selector": r"newcommand{\summary}", + "props": r":[1]{\textit{#1}};", }, { - 'selector': r'newcommand{\significant}', - 'props': r':[1]{#1*};', + "selector": r"newcommand{\significant}", + "props": r":[1]{#1*};", }, { - 'selector': r'newcommand{\rank}', - 'props': ( - r':[2]{\ifnum#1=1 \textbf{#2} \else ' - r'\ifnum#1=2 \underline{#2} \fi\fi};' + "selector": r"newcommand{\rank}", + "props": ( + r":[2]{\ifnum#1=1 \textbf{#2} \else " + r"\ifnum#1=2 \underline{#2} \else #2 \fi\fi};" ), }, ], overwrite=False, ) - for rank in range(styler.data.shape[1]): + for rank in range(1, styler.data.shape[1] + 1): styler = _set_style_from_class( styler, f"rank{rank}", @@ -423,12 +415,13 @@ def scores_table( estimators: Sequence[str], nobs: int | None = None, greater_is_better: bool = True, - method: Literal['average', 'min', 'max', 'dense', 'ordinal'] = 'min', + method: Literal["average", "min", "max", "dense", "ordinal"] = "min", significancy_level: float = 0, paired_test: bool = False, two_sided: bool = True, default_style: Literal["html", "latex", None] = "html", precision: int = 2, + show_rank: bool = True, summary_rows: Sequence[Tuple[str, Callable[..., SummaryRow]]] = ( ("Average rank", average_rank), ), @@ -502,12 +495,14 @@ def scores_table( stds = np.std(scores, axis=-1) nobs = scores.shape[-1] - ranks = np.asarray([ - rankdata(-m, method=method) - if greater_is_better - else rankdata(m, method=method) - for m in means - ]) + ranks = np.asarray( + [ + rankdata(-m, method=method) + if greater_is_better + else rankdata(m, method=method) + for m in means.round(precision) + ] + ) significants = _all_significants( scores, @@ -526,7 +521,7 @@ def scores_table( table.loc[d, e] = ScoreCell( mean=means[i, j], std=None if stds is None else stds[i, j], - rank=ranks[i, j], + rank=int(ranks[i, j]), significant=significants[i, j], ) @@ -561,6 +556,7 @@ def scores_table( styler = _set_style_formatter( styler, precision=precision, + show_rank=show_rank, ) return _set_default_style( @@ -576,7 +572,7 @@ def hypotheses_table( *, alpha: float = 0.05, multitest: Optional[MultitestLike] = None, - test: TestLike = 'wilcoxon', + test: TestLike = "wilcoxon", correction: CorrectionLike = None, multitest_args: Optional[Mapping[str, Any]] = None, test_args: Optional[Mapping[str, Any]] = None, @@ -627,31 +623,30 @@ def hypotheses_table( versus = list(it.combinations(range(len(models)), 2)) comparisons = [ - f"{models[first]} vs {models[second]}" - for first, second in versus + f"{models[first]} vs {models[second]}" for first, second in versus ] multitests = { - 'kruskal': kruskal, - 'friedmanchisquare': friedmanchisquare, + "kruskal": kruskal, + "friedmanchisquare": friedmanchisquare, } tests = { - 'mannwhitneyu': mannwhitneyu, - 'wilcoxon': wilcoxon, + "mannwhitneyu": mannwhitneyu, + "wilcoxon": wilcoxon, } multitest_table = None if multitest is not None: multitest_table = pd.DataFrame( index=[multitest], - columns=['p-value', 'Hypothesis'], + columns=["p-value", "Hypothesis"], ) _, pvalue = multitests[multitest]( *samples.T, **multitest_args, ) - reject_str = 'Rejected' if pvalue <= alpha else 'Not rejected' - multitest_table.loc[multitest] = ['{0:.2f}'.format(pvalue), reject_str] + reject_str = "Rejected" if pvalue <= alpha else "Not rejected" + multitest_table.loc[multitest] = ["{0:.2f}".format(pvalue), reject_str] # If the multitest does not detect a significative difference, # the individual tests are not meaningful, so skip them. @@ -663,7 +658,8 @@ def hypotheses_table( samples[:, first], samples[:, second], **test_args, - )[1] for first, second in versus + )[1] + for first, second in versus ] if correction is not None: @@ -672,29 +668,18 @@ def hypotheses_table( alpha, method=correction, ) - reject = [ - 'Rejected' - if r - else 'Not rejected' - for r in reject_bool - ] + reject = ["Rejected" if r else "Not rejected" for r in reject_bool] else: reject = [ - 'Rejected' - if pvalue <= alpha - else 'Not rejected' - for pvalue in pvalues + "Rejected" if pvalue <= alpha else "Not rejected" for pvalue in pvalues ] - data = [ - ('{0:.2f}'.format(p), r) - for p, r in zip(pvalues, reject) - ] + data = [("{0:.2f}".format(p), r) for p, r in zip(pvalues, reject)] test_table = pd.DataFrame( data, index=comparisons, - columns=['p-value', 'Hypothesis'], + columns=["p-value", "Hypothesis"], ) return multitest_table, test_table