From e13b514581a1b16d2ec40fbca3069aa5a578035d Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Thu, 19 Oct 2023 21:15:32 +0200 Subject: [PATCH 01/43] [GH-15654] Introduce MLFlow flavors for working with mojos and pojos --- h2o-py-mlflow-flavors/.gitignore | 44 +++++++++++++++ h2o-py-mlflow-flavors/DESCRIPTION.rst | 4 ++ h2o-py-mlflow-flavors/README.md | 3 + h2o-py-mlflow-flavors/build.gradle | 79 ++++++++++++++++++++++++++ h2o-py-mlflow-flavors/setup.cfg | 20 +++++++ h2o-py-mlflow-flavors/setup.py | 81 +++++++++++++++++++++++++++ 6 files changed, 231 insertions(+) create mode 100644 h2o-py-mlflow-flavors/.gitignore create mode 100644 h2o-py-mlflow-flavors/DESCRIPTION.rst create mode 100644 h2o-py-mlflow-flavors/README.md create mode 100644 h2o-py-mlflow-flavors/build.gradle create mode 100644 h2o-py-mlflow-flavors/setup.cfg create mode 100644 h2o-py-mlflow-flavors/setup.py diff --git a/h2o-py-mlflow-flavors/.gitignore b/h2o-py-mlflow-flavors/.gitignore new file mode 100644 index 000000000000..70a02e1541b2 --- /dev/null +++ b/h2o-py-mlflow-flavors/.gitignore @@ -0,0 +1,44 @@ +# Backup files +*.~ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Tests +tests/results/* +tests/*/results/* +tests/*/*/results/* + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo diff --git a/h2o-py-mlflow-flavors/DESCRIPTION.rst b/h2o-py-mlflow-flavors/DESCRIPTION.rst new file mode 100644 index 000000000000..5831785a3014 --- /dev/null +++ b/h2o-py-mlflow-flavors/DESCRIPTION.rst @@ -0,0 +1,4 @@ +H2O-3 MLFlow Flavors +==================== + +A tiny library containing MLFlow flavors for H2O-3 MOJO and POJO models. 
diff --git a/h2o-py-mlflow-flavors/README.md b/h2o-py-mlflow-flavors/README.md new file mode 100644 index 000000000000..cd14c59f0b5e --- /dev/null +++ b/h2o-py-mlflow-flavors/README.md @@ -0,0 +1,3 @@ +# H2O-3 MLFlow Flavors + +A tiny library containing [MLFlow](https://mlflow.org/) flavors for H2O-3 MOJO and POJO models. diff --git a/h2o-py-mlflow-flavors/build.gradle b/h2o-py-mlflow-flavors/build.gradle new file mode 100644 index 000000000000..b6689e594e62 --- /dev/null +++ b/h2o-py-mlflow-flavors/build.gradle @@ -0,0 +1,79 @@ +description = "H2O-3 MLFlow Flavors" + +dependencies {} + +def buildVersion = new H2OBuildVersion(rootDir, version) + +ext { + PROJECT_VERSION = buildVersion.getProjectVersion() + pythonexe = findProperty("pythonExec") ?: "python" + pipexe = findProperty("pipExec") ?: "pip" + if (System.env.VIRTUAL_ENV) { + pythonexe = "${System.env.VIRTUAL_ENV}/bin/python".toString() + pipexe = "${System.env.VIRTUAL_ENV}/bin/pip".toString() + } + testsPath = file("tests") +} + +// +// Create a file with version for Python dist task +// +task createVersionFiles() { + doLast { + file("${buildDir}/h2o/mlflow/").mkdirs() + File version_file = new File("${buildDir}/h2o-mlflow-flavors/", "version.txt") + version_file.write(PROJECT_VERSION) + + File build_file = new File("${buildDir}/h2o-mlflow-flavors/", "buildinfo.txt") + build_file.write(buildVersion.toString()) + } +} + + +task copySrcFiles(type: Copy) { + from ("${projectDir}") { + include "setup.py" + include "setup.cfg" + include "h2o-mlflow-flavors/**" + include "README.md" + include "DESCRIPTION.rst" + } + into "${buildDir}" +} + +task buildDist(type: Exec, dependsOn: [createVersionFiles, copySrcFiles]) { + workingDir buildDir + doFirst { + file("${buildDir}/tmp").mkdirs() + standardOutput = new FileOutputStream(file("${buildDir}/tmp/h2o-mlflow-flavors_buildDist.out")) + } + commandLine getOsSpecificCommandLine([pythonexe, "setup.py", "bdist_wheel"]) +} + +task copyMainDist(type: Copy, dependsOn: 
[buildDist]) { + from ("${buildDir}/main/") { + include "dist/**" + } + into "${buildDir}" +} + +task pythonVersion(type: Exec) { + doFirst { + println(System.env.VIRTUAL_ENV) + println(environment) + } + commandLine getOsSpecificCommandLine([pythonexe, "--version"]) +} + +task cleanBuild(type: Delete) { + doFirst { + println "Cleaning..." + } + delete file("build/") +} + +// +// Define the dependencies +// +clean.dependsOn cleanBuild +build.dependsOn copyMainDist diff --git a/h2o-py-mlflow-flavors/setup.cfg b/h2o-py-mlflow-flavors/setup.cfg new file mode 100644 index 000000000000..b4f33f8e0459 --- /dev/null +++ b/h2o-py-mlflow-flavors/setup.cfg @@ -0,0 +1,20 @@ +[flake8] +# +# E241: (Multiple spaces after ':' or ',') Occasionally aligning code fragments vertically improves readability +# E265: (Block comment should start with '# ') I like having banner comments of the form #--------------------- +# E302: (Functions should be separated with 2 blank lines) PEP8 says that sometimes groups of related functions may be +# separated with 3 lines to improve readability. We do that. +# E303: (Classes should be separated with ? blank lines) "Sparse is better than dense". Extra separators don't hurt. +# E701: (Multiple statements on the same line) PEP8 allows multiple statements on the same line in certain situations, +# for example `if foo: continue` is more readable in 1 line than in 2. +# +# D105: (Missing docstring in magic method) Magic methods have well-defined meaning, docstrings are redundant. 
+# +ignore = E241,E265,E302,E303,E701,D105 +max-line-length = 120 +application-import-names = h2o-mlflow-flavors +import-order-style = smarkets +inline-quotes = " + +[bdist_wheel] +universal = 1 diff --git a/h2o-py-mlflow-flavors/setup.py b/h2o-py-mlflow-flavors/setup.py new file mode 100644 index 000000000000..68d682376f09 --- /dev/null +++ b/h2o-py-mlflow-flavors/setup.py @@ -0,0 +1,81 @@ +# -*- encoding: utf-8 -*- +from setuptools import setup, find_packages +from codecs import open +import os +import sys +import shutil + +here = os.path.abspath(os.path.dirname(__file__)) + +# Get the long description from the relevant file +with open(os.path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f: + long_description = f.read() + +version = "0.0.local" +# Get the version from the relevant file +with open(os.path.join(here, 'h2o_mlflow_flavors/version.txt'), encoding='utf-8') as f: + version = f.read() + +packages = find_packages(exclude=["tests*"]) +print("Found packages: %r" % packages) + +setup( + name='h2o_mlflow_flavors', + + # Versions should comply with PEP440. For a discussion on single-sourcing + # the version across setup.py and the project code, see + # https://packaging.python.org/en/latest/single_source_version.html + version = version, + + description='Collection of extensions for integration of H2O-3 with H2O.ai Cloud', + long_description=long_description, + + # The project's main homepage. + url='https://github.com/h2oai/h2o-3.git', + + # Author details + author='H2O.ai', + author_email='support@h2o.ai', + + # Choose your license + license='Apache v2', + + # See https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # How mature is this project? 
Common values are + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 3 - Alpha", + + # Indicate who your project is intended for + "Intended Audience :: Education", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Customer Service", + "Intended Audience :: Financial and Insurance Industry", + "Intended Audience :: Healthcare Industry", + "Intended Audience :: Telecommunications Industry", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + + # Pick your license as you wish (should match "license" above) + "License :: OSI Approved :: Apache Software License", + + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], + + keywords='machine learning, data mining, statistical analysis, modeling, big data, distributed, parallel', + + packages=packages, + package_data={"h2o": [ + "version.txt", # version file + "buildinfo.txt" # buildinfo file + ]}, + + # run-time dependencies + install_requires=["h2o-mlops-client>=0.58"] +) From 680f528e19e2ec26fd329134f41823bdb5a272d0 Mon Sep 17 00:00:00 2001 From: Eric Wolf Date: Thu, 19 Oct 2023 21:29:41 +0200 Subject: [PATCH 02/43] First version of mojo flavor by Eric Wolf --- .../h2o_mlflow_flavors/h2o_mojo.py | 414 ++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py new file mode 100644 index 000000000000..1ad193266624 --- /dev/null +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -0,0 +1,414 @@ +""" +The `h2o_mlflow_flavors.h2o_mojo` module provides an API for logging H2O DriverlessAI models. 
This module +exports H2O models in the following flavors: + +h2o3 + +""" + +import logging +import os +import pickle +import shutil +import uuid +import warnings + +from typing import Any, Dict, Optional + +import numpy as np +import pandas as pd +#import sktime +import yaml +import h2o +import random +import mlflow +import mlflow.h2o +from h2o.estimators.random_forest import H2ORandomForestEstimator +from h2o.estimators.generic import H2OGenericEstimator +from mlflow import pyfunc +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.models.model import MLMODEL_FILE_NAME +from mlflow.models.utils import _save_example +from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE +from mlflow.models.signature import ModelSignature +from mlflow.models.signature import _infer_signature_from_input_example +from mlflow.models.utils import ModelInputExample +from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils.environment import ( + _CONDA_ENV_FILE_NAME, + _CONSTRAINTS_FILE_NAME, + _PYTHON_ENV_FILE_NAME, + _REQUIREMENTS_FILE_NAME, + _mlflow_conda_env, + _process_conda_env, + _process_pip_requirements, + _PythonEnv, + _validate_env_arguments, +) + +import h2o_mlflow_flavors +from h2o_mlflow_flavors.utils import match_file_from_name_pattern +from h2o_mlflow_flavors.utils import unzip_specific_file + +from mlflow.utils.file_utils import write_to +from mlflow.utils.model_utils import ( + _add_code_from_conf_to_system_path, + _get_flavor_configuration, + _validate_and_copy_code_paths, + _validate_and_prepare_target_save_path, +) +from mlflow.utils.requirements_utils import _get_pinned_requirement +#from sktime.utils.multiindex import flatten_multiindex +from pysparkling import * +_logger = logging.getLogger(__name__) + + 
+FLAVOR_NAME = "h2o==3.42.0.3" +H2O3_MODEL_INI = "model.ini" +MLFLOW_H2O3_MOJO_ARTIFACT = "mlflow/h2o_mojo" +MLFLOW_H2O3_MODEL_FILENAME = "h2o_mojo_model.zip" + + + + + +def get_default_pip_requirements(): + """ + :return: A list of default pip requirements for MLflow Models produced by this flavor. + Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment + that, at minimum, contains these requirements. + """ + return [_get_pinned_requirement("h2o")] + + + + +def get_default_conda_env(): + """ + :return: The default Conda environment for MLflow Models produced by calls to + :func:`save_model()` and :func:`log_model()`. + """ + return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) + + + + +#@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) +def save_model( + h2o_model, + path, + conda_env=None, + code_paths=None, + mlflow_model=None, + settings=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + pip_requirements=None, + extra_pip_requirements=None, + metadata=None, + is_mojo = False +): + """ + Save an H2O model to a path on the local file system. + + :param h2o_model: H2O model to be saved. + :param path: Local path where the model is to be saved. + :param conda_env: {{ conda_env }} + :param code_paths: A list of local filesystem paths to Python file dependencies (or directories + containing file dependencies). These files are *prepended* to the system + path when the model is loaded. + :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. + :param signature: {{ signature }} + :param input_example: {{ input_example }} + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. + + .. Note:: Experimental: This parameter may change or be removed in a future + release without warning. 
+ :param is_mojo: Can save a mojo directly as a h2o_model. If true, calls its own load model + """ + import h2o + if (is_mojo == True): + h2o_model = mlflow.h2o_mojo.load_mojo_model(h2o_model) + + _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) + + path = os.path.abspath(path) + _validate_and_prepare_target_save_path(path) + model_data_subpath = "model.h2o" + model_data_path = os.path.join(path, model_data_subpath) + os.makedirs(model_data_path) + code_dir_subpath = _validate_and_copy_code_paths(code_paths, path) + + if signature is None and input_example is not None: + wrapped_model = _H2OModelWrapper(h2o_model) + signature = _infer_signature_from_input_example(input_example, wrapped_model) + elif signature is False: + signature = None + + if mlflow_model is None: + mlflow_model = Model() + if signature is not None: + mlflow_model.signature = signature + if input_example is not None: + _save_example(mlflow_model, input_example, path) + if metadata is not None: + mlflow_model.metadata = metadata + + # Save h2o-model + if hasattr(h2o, "download_model"): + h2o_save_location = h2o.download_model(model=h2o_model, path=model_data_path) + else: + warnings.warn( + "If your cluster is remote, H2O may not store the model correctly. 
" + "Please upgrade H2O version to a newer version" + ) + h2o_save_location = h2o.save_model(model=h2o_model, path=model_data_path, force=True) + model_file = os.path.basename(h2o_save_location) + + # Save h2o-settings + if settings is None: + settings = {} + settings["full_file"] = h2o_save_location + settings["model_file"] = model_file + settings["model_dir"] = model_data_path + with open(os.path.join(model_data_path, "h2o.yaml"), "w") as settings_file: + yaml.safe_dump(settings, stream=settings_file) + + pyfunc.add_to_model( + mlflow_model, + loader_module="mlflow.h2o", + data=model_data_subpath, + conda_env=_CONDA_ENV_FILE_NAME, + python_env=_PYTHON_ENV_FILE_NAME, + code=code_dir_subpath, + ) + mlflow_model.add_flavor( + FLAVOR_NAME, h2o_version=h2o.__version__, data=model_data_subpath, code=code_dir_subpath + ) + mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) + + if conda_env is None: + if pip_requirements is None: + default_reqs = get_default_pip_requirements() + # To ensure `_load_pyfunc` can successfully load the model during the dependency + # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. 
+ inferred_reqs = mlflow.models.infer_pip_requirements( + path, + FLAVOR_NAME, + fallback=default_reqs, + ) + default_reqs = sorted(set(inferred_reqs).union(default_reqs)) + else: + default_reqs = None + conda_env, pip_requirements, pip_constraints = _process_pip_requirements( + default_reqs, + pip_requirements, + extra_pip_requirements, + ) + else: + conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) + + with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: + yaml.safe_dump(conda_env, stream=f, default_flow_style=False) + + # Save `constraints.txt` if necessary + if pip_constraints: + write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) + + # Save `requirements.txt` + write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) + + _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME)) + + + + +#@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) +"""def log_model(h2o3_artifact_location, + artifact_path, + h2o3_model_download_location="/tmp/" + str(uuid.uuid1()), + conda_env=None, + registered_model_name=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + pip_requirements=None, + extra_pip_requirements=None, + **kwargs, + ): + model_type = _validate_h2o3_model(h2o3_artifact_location) + + h2o3_model_directory = _create_model_file(h2o3_artifact_location, h2o3_model_download_location) + + return Model.log( + artifact_path=artifact_path, + flavor=h2o_mlflow_flavors.h2o3, + registered_model_name=registered_model_name, + h2o3_artifact_location=h2o3_artifact_location, + conda_env=conda_env, + signature=signature, + input_example=input_example, + pip_requirements=pip_requirements, + extra_pip_requirements=extra_pip_requirements, + model_type=model_type, + h2o3_model_directory=h2o3_model_directory, + **kwargs, + ) + +def _validate_h2o3_model(h2o3_model): + if match_file_from_name_pattern(h2o3_model, H2O3_MODEL_INI): + 
return MLFLOW_H2O3_MOJO_ARTIFACT + else: + raise MlflowException.invalid_parameter_value("The model is not a valid H2O3 MOJO File") + + +def _create_model_file(h2o3_model, h2o_dai_model_download_location): + location = h2o_dai_model_download_location + "/" + model_file_location = location + "model" + unzip_specific_file(h2o3_model, H2O3_MODEL_INI, directory=model_file_location) + dst = shutil.copy(h2o3_model, model_file_location) + os.rename(dst, model_file_location+"/"+MLFLOW_H2O3_MODEL_FILENAME) + return model_file_location""" + +def log_model( + h2o_model, + artifact_path, + conda_env=None, + code_paths=None, + registered_model_name=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + pip_requirements=None, + extra_pip_requirements=None, + metadata=None, + **kwargs, +): + """ + Log an H2O model as an MLflow artifact for the current run. + + :param h2o_model: H2O model to be saved. + :param artifact_path: Run-relative artifact path. + :param conda_env: {{ conda_env }} + :param code_paths: A list of local filesystem paths to Python file dependencies (or directories + containing file dependencies). These files are *prepended* to the system + path when the model is loaded. + :param registered_model_name: If given, create a model version under + ``registered_model_name``, also creating a registered model if one + with the given name does not exist. + + :param signature: {{ signature }} + :param input_example: {{ input_example }} + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. + + .. Note:: Experimental: This parameter may change or be removed in a future + release without warning. + :param kwargs: kwargs to pass to ``h2o.save_model`` method. + :return: A :py:class:`ModelInfo ` instance that contains the + metadata of the logged model. 
+ """ + return Model.log( + artifact_path=artifact_path, + flavor=mlflow.h2o, + registered_model_name=registered_model_name, + h2o_model=h2o_model, + conda_env=conda_env, + code_paths=code_paths, + signature=signature, + input_example=input_example, + pip_requirements=pip_requirements, + extra_pip_requirements=extra_pip_requirements, + metadata=metadata, + **kwargs, + ) + + + +def _load_model(path, init=False): + import h2o + + path = os.path.abspath(path) + with open(os.path.join(path, "h2o.yaml")) as f: + params = yaml.safe_load(f.read()) + if init: + h2o.init(**(params["init"] if "init" in params else {})) + h2o.no_progress() + + model_path = os.path.join(path, params["model_file"]) + if hasattr(h2o, "upload_model"): + model = h2o.upload_model(model_path) + else: + warnings.warn( + "If your cluster is remote, H2O may not load the model correctly. " + "Please upgrade H2O version to a newer version" + ) + model = h2o.load_model(model_path) + + return model + + +class _H2OModelWrapper: + def __init__(self, h2o_model): + self.h2o_model = h2o_model + + def predict( + self, dataframe, params: Optional[Dict[str, Any]] = None + ): # pylint: disable=unused-argument + """ + :param dataframe: Model input data. + :param params: Additional parameters to pass to the model for inference. + + .. Note:: Experimental: This parameter may change or be removed in a future + release without warning. + + :return: Model predictions. + """ + import h2o + + predicted = self.h2o_model.predict(h2o.H2OFrame(dataframe)).as_data_frame() + predicted.index = dataframe.index + return predicted + + +def _load_pyfunc(path): + """ + Load PyFunc implementation. Called by ``pyfunc.load_model``. + + :param path: Local filesystem path to the MLflow Model with the ``h2o`` flavor. 
+ """ + return _H2OModelWrapper(_load_model(path, init=True)) + + +def load_mojo_model(mojo_path, model_id=None,estimator=None): + """ + Uploads an existing MOJO model from local filesystem into H2O and imports it as an H2O Generic Model. + + :param mojo_path: Path to the MOJO archive on the user's local filesystem + :param model_id: Model ID, default None + :param estimator: uses H2OGenericEstimator on default None. ,estimator=None + :return: An H2OGenericEstimator instance embedding given MOJO + """ + hc = H2OContext.getOrCreate() + if mojo_path is None: + raise TypeError("MOJO path may not be None") + if estimator != None: + mojo_estimator = estimator(mojo_path,model_id) + else: + mojo_estimator = H2OGenericEstimator.from_file(mojo_path, model_id) + return mojo_estimator + +""" if mojo_path is None: + raise TypeError("MOJO path may not be None") + hc = H2OContext.getOrCreate() + #original_model_filename = model.download_mojo(model_uri) + return h2o.import_mojo(mojo_path)""" +""" if estimator is None: + return h2o.import_mojo(mojo_path, dst_path=None) + else: + response = api("POST /3/PostFile", filename=mojo_path) + frame_key = response["destination_frame"]""" From 6431595928991afad7f242591842d71c4bd6684b Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Fri, 20 Oct 2023 21:45:31 +0200 Subject: [PATCH 03/43] New saving method --- .../h2o_mlflow_flavors/__init__.py | 0 .../h2o_mlflow_flavors/h2o_mojo.py | 199 ++---------------- 2 files changed, 18 insertions(+), 181 deletions(-) create mode 100644 h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index 1ad193266624..214208a79d63 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ 
b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -8,35 +8,22 @@ import logging import os -import pickle -import shutil -import uuid import warnings from typing import Any, Dict, Optional -import numpy as np -import pandas as pd -#import sktime import yaml -import h2o -import random + import mlflow import mlflow.h2o -from h2o.estimators.random_forest import H2ORandomForestEstimator from h2o.estimators.generic import H2OGenericEstimator from mlflow import pyfunc -from mlflow.exceptions import MlflowException from mlflow.models import Model from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.utils import _save_example -from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE from mlflow.models.signature import ModelSignature from mlflow.models.signature import _infer_signature_from_input_example from mlflow.models.utils import ModelInputExample -from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS -from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring -from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils.environment import ( _CONDA_ENV_FILE_NAME, _CONSTRAINTS_FILE_NAME, @@ -49,10 +36,6 @@ _validate_env_arguments, ) -import h2o_mlflow_flavors -from h2o_mlflow_flavors.utils import match_file_from_name_pattern -from h2o_mlflow_flavors.utils import unzip_specific_file - from mlflow.utils.file_utils import write_to from mlflow.utils.model_utils import ( _add_code_from_conf_to_system_path, @@ -61,20 +44,16 @@ _validate_and_prepare_target_save_path, ) from mlflow.utils.requirements_utils import _get_pinned_requirement -#from sktime.utils.multiindex import flatten_multiindex from pysparkling import * _logger = logging.getLogger(__name__) -FLAVOR_NAME = "h2o==3.42.0.3" +FLAVOR_NAME = "h2o_mojo" H2O3_MODEL_INI = "model.ini" MLFLOW_H2O3_MOJO_ARTIFACT = "mlflow/h2o_mojo" MLFLOW_H2O3_MODEL_FILENAME = "h2o_mojo_model.zip" - - - def 
get_default_pip_requirements(): """ :return: A list of default pip requirements for MLflow Models produced by this flavor. @@ -84,8 +63,6 @@ def get_default_pip_requirements(): return [_get_pinned_requirement("h2o")] - - def get_default_conda_env(): """ :return: The default Conda environment for MLflow Models produced by calls to @@ -93,122 +70,60 @@ def get_default_conda_env(): """ return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) - - - -#@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) def save_model( h2o_model, path, conda_env=None, code_paths=None, mlflow_model=None, - settings=None, - signature: ModelSignature = None, - input_example: ModelInputExample = None, + signature=None, + input_example=None, pip_requirements=None, extra_pip_requirements=None, - metadata=None, - is_mojo = False ): - """ - Save an H2O model to a path on the local file system. - - :param h2o_model: H2O model to be saved. - :param path: Local path where the model is to be saved. - :param conda_env: {{ conda_env }} - :param code_paths: A list of local filesystem paths to Python file dependencies (or directories - containing file dependencies). These files are *prepended* to the system - path when the model is loaded. - :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. - :param signature: {{ signature }} - :param input_example: {{ input_example }} - :param pip_requirements: {{ pip_requirements }} - :param extra_pip_requirements: {{ extra_pip_requirements }} - :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. - - .. Note:: Experimental: This parameter may change or be removed in a future - release without warning. - :param is_mojo: Can save a mojo directly as a h2o_model. 
If true, calls its own load model - """ import h2o - if (is_mojo == True): - h2o_model = mlflow.h2o_mojo.load_mojo_model(h2o_model) - + _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) - - path = os.path.abspath(path) _validate_and_prepare_target_save_path(path) - model_data_subpath = "model.h2o" - model_data_path = os.path.join(path, model_data_subpath) - os.makedirs(model_data_path) code_dir_subpath = _validate_and_copy_code_paths(code_paths, path) - if signature is None and input_example is not None: - wrapped_model = _H2OModelWrapper(h2o_model) - signature = _infer_signature_from_input_example(input_example, wrapped_model) - elif signature is False: - signature = None - if mlflow_model is None: mlflow_model = Model() if signature is not None: mlflow_model.signature = signature if input_example is not None: _save_example(mlflow_model, input_example, path) - if metadata is not None: - mlflow_model.metadata = metadata - - # Save h2o-model - if hasattr(h2o, "download_model"): - h2o_save_location = h2o.download_model(model=h2o_model, path=model_data_path) - else: - warnings.warn( - "If your cluster is remote, H2O may not store the model correctly. 
" - "Please upgrade H2O version to a newer version" - ) - h2o_save_location = h2o.save_model(model=h2o_model, path=model_data_path, force=True) - model_file = os.path.basename(h2o_save_location) - - # Save h2o-settings - if settings is None: - settings = {} - settings["full_file"] = h2o_save_location - settings["model_file"] = model_file - settings["model_dir"] = model_data_path - with open(os.path.join(model_data_path, "h2o.yaml"), "w") as settings_file: - yaml.safe_dump(settings, stream=settings_file) - + + h2o_model.download_mojo(path=path, get_genmodel_jar=True) + pyfunc.add_to_model( mlflow_model, - loader_module="mlflow.h2o", - data=model_data_subpath, + loader_module="flavor", + model_path=model_data_subpath, conda_env=_CONDA_ENV_FILE_NAME, python_env=_PYTHON_ENV_FILE_NAME, code=code_dir_subpath, ) + mlflow_model.add_flavor( - FLAVOR_NAME, h2o_version=h2o.__version__, data=model_data_subpath, code=code_dir_subpath + FLAVOR_NAME, + pickled_model=model_data_subpath, + h2o_version=h2o.__version__, + code=code_dir_subpath, ) mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) if conda_env is None: if pip_requirements is None: default_reqs = get_default_pip_requirements() - # To ensure `_load_pyfunc` can successfully load the model during the dependency - # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. 
inferred_reqs = mlflow.models.infer_pip_requirements( - path, - FLAVOR_NAME, - fallback=default_reqs, + path, FLAVOR_NAME, fallback=default_reqs ) default_reqs = sorted(set(inferred_reqs).union(default_reqs)) else: default_reqs = None conda_env, pip_requirements, pip_constraints = _process_pip_requirements( - default_reqs, - pip_requirements, - extra_pip_requirements, + default_reqs, pip_requirements, extra_pip_requirements ) else: conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) @@ -216,64 +131,15 @@ def save_model( with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: yaml.safe_dump(conda_env, stream=f, default_flow_style=False) - # Save `constraints.txt` if necessary if pip_constraints: write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) - # Save `requirements.txt` write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME)) - -#@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) -"""def log_model(h2o3_artifact_location, - artifact_path, - h2o3_model_download_location="/tmp/" + str(uuid.uuid1()), - conda_env=None, - registered_model_name=None, - signature: ModelSignature = None, - input_example: ModelInputExample = None, - pip_requirements=None, - extra_pip_requirements=None, - **kwargs, - ): - model_type = _validate_h2o3_model(h2o3_artifact_location) - - h2o3_model_directory = _create_model_file(h2o3_artifact_location, h2o3_model_download_location) - - return Model.log( - artifact_path=artifact_path, - flavor=h2o_mlflow_flavors.h2o3, - registered_model_name=registered_model_name, - h2o3_artifact_location=h2o3_artifact_location, - conda_env=conda_env, - signature=signature, - input_example=input_example, - pip_requirements=pip_requirements, - extra_pip_requirements=extra_pip_requirements, - model_type=model_type, - h2o3_model_directory=h2o3_model_directory, - **kwargs, - ) - -def 
_validate_h2o3_model(h2o3_model): - if match_file_from_name_pattern(h2o3_model, H2O3_MODEL_INI): - return MLFLOW_H2O3_MOJO_ARTIFACT - else: - raise MlflowException.invalid_parameter_value("The model is not a valid H2O3 MOJO File") - - -def _create_model_file(h2o3_model, h2o_dai_model_download_location): - location = h2o_dai_model_download_location + "/" - model_file_location = location + "model" - unzip_specific_file(h2o3_model, H2O3_MODEL_INI, directory=model_file_location) - dst = shutil.copy(h2o3_model, model_file_location) - os.rename(dst, model_file_location+"/"+MLFLOW_H2O3_MODEL_FILENAME) - return model_file_location""" - def log_model( h2o_model, artifact_path, @@ -314,7 +180,7 @@ def log_model( """ return Model.log( artifact_path=artifact_path, - flavor=mlflow.h2o, + flavor=h2o_mlflow_flavors.h2o_mojo, registered_model_name=registered_model_name, h2o_model=h2o_model, conda_env=conda_env, @@ -383,32 +249,3 @@ def _load_pyfunc(path): """ return _H2OModelWrapper(_load_model(path, init=True)) - -def load_mojo_model(mojo_path, model_id=None,estimator=None): - """ - Uploads an existing MOJO model from local filesystem into H2O and imports it as an H2O Generic Model. - - :param mojo_path: Path to the MOJO archive on the user's local filesystem - :param model_id: Model ID, default None - :param estimator: uses H2OGenericEstimator on default None. 
,estimator=None - :return: An H2OGenericEstimator instance embedding given MOJO - """ - hc = H2OContext.getOrCreate() - if mojo_path is None: - raise TypeError("MOJO path may not be None") - if estimator != None: - mojo_estimator = estimator(mojo_path,model_id) - else: - mojo_estimator = H2OGenericEstimator.from_file(mojo_path, model_id) - return mojo_estimator - -""" if mojo_path is None: - raise TypeError("MOJO path may not be None") - hc = H2OContext.getOrCreate() - #original_model_filename = model.download_mojo(model_uri) - return h2o.import_mojo(mojo_path)""" -""" if estimator is None: - return h2o.import_mojo(mojo_path, dst_path=None) - else: - response = api("POST /3/PostFile", filename=mojo_path) - frame_key = response["destination_frame"]""" From 1bd9c021a0ba13ac32cb5565c177aedb7c9a37e7 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 23 Oct 2023 12:04:54 +0200 Subject: [PATCH 04/43] Update loading mojos --- .../h2o_mlflow_flavors/h2o_mojo.py | 43 +++++++------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index 214208a79d63..c3f674cc58b3 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -8,21 +8,15 @@ import logging import os -import warnings - -from typing import Any, Dict, Optional import yaml import mlflow -import mlflow.h2o -from h2o.estimators.generic import H2OGenericEstimator from mlflow import pyfunc from mlflow.models import Model from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.utils import _save_example from mlflow.models.signature import ModelSignature -from mlflow.models.signature import _infer_signature_from_input_example from mlflow.models.utils import ModelInputExample from mlflow.utils.environment import ( _CONDA_ENV_FILE_NAME, @@ -44,15 +38,12 @@ _validate_and_prepare_target_save_path, ) from 
mlflow.utils.requirements_utils import _get_pinned_requirement -from pysparkling import * -_logger = logging.getLogger(__name__) +from mlflow.tracking.artifact_utils import _download_artifact_from_uri -FLAVOR_NAME = "h2o_mojo" -H2O3_MODEL_INI = "model.ini" -MLFLOW_H2O3_MOJO_ARTIFACT = "mlflow/h2o_mojo" -MLFLOW_H2O3_MODEL_FILENAME = "h2o_mojo_model.zip" +_logger = logging.getLogger(__name__) +FLAVOR_NAME = "h2o_mojo" def get_default_pip_requirements(): """ @@ -93,8 +84,8 @@ def save_model( mlflow_model.signature = signature if input_example is not None: _save_example(mlflow_model, input_example, path) - - h2o_model.download_mojo(path=path, get_genmodel_jar=True) + + model_data_subpath = h2o_model.download_mojo(path=path, get_genmodel_jar=True) pyfunc.add_to_model( mlflow_model, @@ -107,7 +98,7 @@ def save_model( mlflow_model.add_flavor( FLAVOR_NAME, - pickled_model=model_data_subpath, + model_file=model_data_subpath, h2o_version=h2o.__version__, code=code_dir_subpath, ) @@ -139,7 +130,6 @@ def save_model( _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME)) - def log_model( h2o_model, artifact_path, @@ -194,6 +184,12 @@ def log_model( ) +def load_model(model_uri, dst_path=None): + local_model_path = _download_artifact_from_uri( + artifact_uri=model_uri, output_path=dst_path + ) + return _H2OModelWrapper(_load_model(path, init=True)) + def _load_model(path, init=False): import h2o @@ -202,18 +198,11 @@ def _load_model(path, init=False): with open(os.path.join(path, "h2o.yaml")) as f: params = yaml.safe_load(f.read()) if init: - h2o.init(**(params["init"] if "init" in params else {})) + h2o.init() h2o.no_progress() model_path = os.path.join(path, params["model_file"]) - if hasattr(h2o, "upload_model"): - model = h2o.upload_model(model_path) - else: - warnings.warn( - "If your cluster is remote, H2O may not load the model correctly. 
" - "Please upgrade H2O version to a newer version" - ) - model = h2o.load_model(model_path) + model = h2o.import_mojo(model_path) return model @@ -222,9 +211,7 @@ class _H2OModelWrapper: def __init__(self, h2o_model): self.h2o_model = h2o_model - def predict( - self, dataframe, params: Optional[Dict[str, Any]] = None - ): # pylint: disable=unused-argument + def predict(self, dataframe, params=None): """ :param dataframe: Model input data. :param params: Additional parameters to pass to the model for inference. From 5862564a5bc567e4b9858f7b1ad17744c0d8c6b3 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 23 Oct 2023 12:46:47 +0200 Subject: [PATCH 05/43] Fix upload mojos. --- h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index c3f674cc58b3..3a10c237cef1 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -185,7 +185,7 @@ def log_model( def load_model(model_uri, dst_path=None): - local_model_path = _download_artifact_from_uri( + path = _download_artifact_from_uri( artifact_uri=model_uri, output_path=dst_path ) return _H2OModelWrapper(_load_model(path, init=True)) From b969f1f2156c057eb2fd8b95ac31fef304d56cb6 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 23 Oct 2023 14:47:48 +0200 Subject: [PATCH 06/43] Fix building --- build.gradle | 3 ++- h2o-py-mlflow-flavors/build.gradle | 10 +++++----- h2o-py-mlflow-flavors/setup.cfg | 2 +- h2o-py-mlflow-flavors/setup.py | 2 +- settings.gradle | 1 + 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/build.gradle b/build.gradle index f816fc2d62c5..59cf91e57428 100644 --- a/build.gradle +++ b/build.gradle @@ -155,7 +155,8 @@ ext { pythonProjects = [ project(':h2o-py'), - project(':h2o-py-cloud-extensions') + project(':h2o-py-cloud-extensions'), + 
project(':h2o-py-mlflow-flavors') ] // The project which need to be run under CI only diff --git a/h2o-py-mlflow-flavors/build.gradle b/h2o-py-mlflow-flavors/build.gradle index b6689e594e62..8b374cdbcd50 100644 --- a/h2o-py-mlflow-flavors/build.gradle +++ b/h2o-py-mlflow-flavors/build.gradle @@ -20,11 +20,11 @@ ext { // task createVersionFiles() { doLast { - file("${buildDir}/h2o/mlflow/").mkdirs() - File version_file = new File("${buildDir}/h2o-mlflow-flavors/", "version.txt") + file("${buildDir}/h2o_mlflow_flavors/").mkdirs() + File version_file = new File("${buildDir}/h2o_mlflow_flavors/", "version.txt") version_file.write(PROJECT_VERSION) - File build_file = new File("${buildDir}/h2o-mlflow-flavors/", "buildinfo.txt") + File build_file = new File("${buildDir}/h2o_mlflow_flavors/", "buildinfo.txt") build_file.write(buildVersion.toString()) } } @@ -34,7 +34,7 @@ task copySrcFiles(type: Copy) { from ("${projectDir}") { include "setup.py" include "setup.cfg" - include "h2o-mlflow-flavors/**" + include "h2o_mlflow_flavors/**" include "README.md" include "DESCRIPTION.rst" } @@ -45,7 +45,7 @@ task buildDist(type: Exec, dependsOn: [createVersionFiles, copySrcFiles]) { workingDir buildDir doFirst { file("${buildDir}/tmp").mkdirs() - standardOutput = new FileOutputStream(file("${buildDir}/tmp/h2o-mlflow-flavors_buildDist.out")) + standardOutput = new FileOutputStream(file("${buildDir}/tmp/h2o_mlflow_flavors_buildDist.out")) } commandLine getOsSpecificCommandLine([pythonexe, "setup.py", "bdist_wheel"]) } diff --git a/h2o-py-mlflow-flavors/setup.cfg b/h2o-py-mlflow-flavors/setup.cfg index b4f33f8e0459..29a557ba78a8 100644 --- a/h2o-py-mlflow-flavors/setup.cfg +++ b/h2o-py-mlflow-flavors/setup.cfg @@ -12,7 +12,7 @@ # ignore = E241,E265,E302,E303,E701,D105 max-line-length = 120 -application-import-names = h2o-mlflow-flavors +application-import-names = h2o_mlflow_flavors import-order-style = smarkets inline-quotes = " diff --git a/h2o-py-mlflow-flavors/setup.py 
b/h2o-py-mlflow-flavors/setup.py index 68d682376f09..e274594e3110 100644 --- a/h2o-py-mlflow-flavors/setup.py +++ b/h2o-py-mlflow-flavors/setup.py @@ -77,5 +77,5 @@ ]}, # run-time dependencies - install_requires=["h2o-mlops-client>=0.58"] + install_requires=["mlflow>=1.29.0"] ) diff --git a/settings.gradle b/settings.gradle index f7e8a05d05c2..fcb368884731 100644 --- a/settings.gradle +++ b/settings.gradle @@ -10,6 +10,7 @@ include 'h2o-app' include 'h2o-r' include 'h2o-py' include 'h2o-py-cloud-extensions' +include 'h2o-py-mlflow-flavors' include 'h2o-assemblies:main' include 'h2o-assemblies:minimal' include 'h2o-assemblies:steam' From e038dbe42368c5127806ba9b220df8496d00780a Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 23 Oct 2023 19:17:23 +0200 Subject: [PATCH 07/43] Fix h2o_mlflow_flavor --- h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index 3a10c237cef1..3ae1fcf3f820 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -168,6 +168,7 @@ def log_model( :return: A :py:class:`ModelInfo ` instance that contains the metadata of the logged model. 
""" + import h2o_mlflow_flavors return Model.log( artifact_path=artifact_path, flavor=h2o_mlflow_flavors.h2o_mojo, From 0a32fb826953c76638e35f9c5c11bbcdec2e97a5 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 31 Oct 2023 14:24:44 +0100 Subject: [PATCH 08/43] Update --- h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index 3ae1fcf3f820..dc2bffa7de7f 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -1,7 +1,5 @@ """ -The `h2o_mlflow_flavors.h2o_mojo` module provides an API for logging H2O DriverlessAI models. This models -exports H2O models in the following flavors: - +The `h2o_mlflow_flavors.h2o_mojo` module provides an API for working with H2O MOJO models. h2o3 """ @@ -89,7 +87,7 @@ def save_model( pyfunc.add_to_model( mlflow_model, - loader_module="flavor", + loader_module=FLAVOR_NAME, model_path=model_data_subpath, conda_env=_CONDA_ENV_FILE_NAME, python_env=_PYTHON_ENV_FILE_NAME, @@ -223,7 +221,7 @@ def predict(self, dataframe, params=None): :return: Model predictions. 
""" import h2o - + predicted = self.h2o_model.predict(h2o.H2OFrame(dataframe)).as_data_frame() predicted.index = dataframe.index return predicted @@ -237,3 +235,4 @@ def _load_pyfunc(path): """ return _H2OModelWrapper(_load_model(path, init=True)) + From ae3336c8be258975da32c34873943f96a4669cbc Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 31 Oct 2023 14:35:00 +0100 Subject: [PATCH 09/43] Update loader module --- h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index dc2bffa7de7f..db6e91af47a9 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -87,7 +87,7 @@ def save_model( pyfunc.add_to_model( mlflow_model, - loader_module=FLAVOR_NAME, + loader_module="h2o_mlflow_flavors.h2o_mojo", model_path=model_data_subpath, conda_env=_CONDA_ENV_FILE_NAME, python_env=_PYTHON_ENV_FILE_NAME, From 8c9bb69b5a529cb13891d6ab19b11955cd4dc91a Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Wed, 1 Nov 2023 19:08:56 +0100 Subject: [PATCH 10/43] Update h2o_mojo --- .../h2o_mlflow_flavors/h2o_mojo.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py index db6e91af47a9..d1314e32511b 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py @@ -83,12 +83,13 @@ def save_model( if input_example is not None: _save_example(mlflow_model, input_example, path) - model_data_subpath = h2o_model.download_mojo(path=path, get_genmodel_jar=True) + model_data_path = h2o_model.download_mojo(path=path, get_genmodel_jar=True) + model_file = os.path.basename(model_data_path) pyfunc.add_to_model( mlflow_model, 
loader_module="h2o_mlflow_flavors.h2o_mojo", - model_path=model_data_subpath, + model_path=model_file, conda_env=_CONDA_ENV_FILE_NAME, python_env=_PYTHON_ENV_FILE_NAME, code=code_dir_subpath, @@ -96,7 +97,7 @@ def save_model( mlflow_model.add_flavor( FLAVOR_NAME, - model_file=model_data_subpath, + model_file=model_file, h2o_version=h2o.__version__, code=code_dir_subpath, ) @@ -193,14 +194,12 @@ def load_model(model_uri, dst_path=None): def _load_model(path, init=False): import h2o - path = os.path.abspath(path) - with open(os.path.join(path, "h2o.yaml")) as f: - params = yaml.safe_load(f.read()) if init: - h2o.init() + h2o.init(strict_version_check=False) h2o.no_progress() - model_path = os.path.join(path, params["model_file"]) + flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME) + model_path = os.path.join(path, flavor_conf["model_file"]) model = h2o.import_mojo(model_path) return model @@ -215,9 +214,6 @@ def predict(self, dataframe, params=None): :param dataframe: Model input data. :param params: Additional parameters to pass to the model for inference. - .. Note:: Experimental: This parameter may change or be removed in a future - release without warning. - :return: Model predictions. """ import h2o From 6050e6499ddb26873dd29dd1fed38a72757d857b Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Fri, 3 Nov 2023 13:41:16 +0100 Subject: [PATCH 11/43] Add genmodel flavor --- .../h2o_mlflow_flavors/h2o_gen_model.py | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py new file mode 100644 index 000000000000..c323ec8d6f57 --- /dev/null +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -0,0 +1,270 @@ +""" +The `h2o_mlflow_flavors.h2o_gen_model` module provides an API for working with H2O MOJO and POJO models. 
+""" + +import logging +import os +import tempfile +import pandas +import subprocess +import sys + +import yaml + +import mlflow +from mlflow import pyfunc +from mlflow.models import Model +from mlflow.models.model import MLMODEL_FILE_NAME +from mlflow.models.utils import _save_example +from mlflow.models.signature import ModelSignature +from mlflow.models.utils import ModelInputExample +from mlflow.utils.environment import ( + _CONDA_ENV_FILE_NAME, + _CONSTRAINTS_FILE_NAME, + _PYTHON_ENV_FILE_NAME, + _REQUIREMENTS_FILE_NAME, + _mlflow_conda_env, + _process_conda_env, + _process_pip_requirements, + _PythonEnv, + _validate_env_arguments, +) + +from mlflow.utils.file_utils import write_to +from mlflow.utils.model_utils import ( + _add_code_from_conf_to_system_path, + _get_flavor_configuration, + _validate_and_copy_code_paths, + _validate_and_prepare_target_save_path, +) +from mlflow.utils.requirements_utils import _get_pinned_requirement +from mlflow.tracking.artifact_utils import _download_artifact_from_uri + + +_logger = logging.getLogger(__name__) + +FLAVOR_NAME = "h2o_gen_model" + + +def get_default_pip_requirements(): + """ + :return: A list of default pip requirements for MLflow Models produced by this flavor. + Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment + that, at minimum, contains these requirements. + """ + return [] + + +def get_default_conda_env(): + """ + :return: The default Conda environment for MLflow Models produced by calls to + :func:`save_model()` and :func:`log_model()`. 
+ """ + return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) + + +def save_model( + h2o_model, + path, + conda_env=None, + code_paths=None, + mlflow_model=None, + signature=None, + input_example=None, + pip_requirements=None, + extra_pip_requirements=None, + model_type="MOJO" +): + import h2o + + model_type_upper = model_type.upper() + if model_type_upper != "MOJO" and model_type_upper != "POJO": + raise ValueError(f"The `model_type` parameter must be 'MOJO' or 'POJO'. The passed value was '{model_type}'.") + + _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) + _validate_and_prepare_target_save_path(path) + code_dir_subpath = _validate_and_copy_code_paths(code_paths, path) + + if mlflow_model is None: + mlflow_model = Model() + if signature is not None: + mlflow_model.signature = signature + if input_example is not None: + _save_example(mlflow_model, input_example, path) + + if model_type_upper == "MOJO": + model_data_path = h2o_model.download_mojo(path=path, get_genmodel_jar=True) + model_file = os.path.basename(model_data_path) + else: + model_data_path = h2o_model.downlaod_pojo(path=path, get_genmodel_jar=True) + javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", java_file] + subprocess.check_call(javac_cmd) + model_file = os.path.basename(model_data_path).replace(".java", ".class") + + pyfunc.add_to_model( + mlflow_model, + loader_module="h2o_mlflow_flavors.h2o_gen_model", + model_path=model_file, + conda_env=_CONDA_ENV_FILE_NAME, + python_env=_PYTHON_ENV_FILE_NAME, + code=code_dir_subpath, + ) + + mlflow_model.add_flavor( + FLAVOR_NAME, + model_file=model_file, + model_type=model_type_upper, + h2o_version=h2o.__version__, + code=code_dir_subpath, + ) + mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) + + if conda_env is None: + if pip_requirements is None: + default_reqs = get_default_pip_requirements() + inferred_reqs = mlflow.models.infer_pip_requirements( + path, FLAVOR_NAME, 
fallback=default_reqs + ) + default_reqs = sorted(set(inferred_reqs).union(default_reqs)) + else: + default_reqs = None + conda_env, pip_requirements, pip_constraints = _process_pip_requirements( + default_reqs, pip_requirements, extra_pip_requirements + ) + else: + conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) + + with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: + yaml.safe_dump(conda_env, stream=f, default_flow_style=False) + + if pip_constraints: + write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) + + write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) + + _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME)) + + +def log_model( + h2o_model, + artifact_path, + conda_env=None, + code_paths=None, + registered_model_name=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + pip_requirements=None, + extra_pip_requirements=None, + metadata=None, + model_type="MOJO", + **kwargs, +): + """ + Log an H2O model as an MLflow artifact for the current run. + + :param h2o_model: H2O model to be saved. + :param artifact_path: Run-relative artifact path. + :param conda_env: {{ conda_env }} + :param code_paths: A list of local filesystem paths to Python file dependencies (or directories + containing file dependencies). These files are *prepended* to the system + path when the model is loaded. + :param registered_model_name: If given, create a model version under + ``registered_model_name``, also creating a registered model if one + with the given name does not exist. + + :param signature: {{ signature }} + :param input_example: {{ input_example }} + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. + + .. 
Note:: Experimental: This parameter may change or be removed in a future + release without warning. + :param model_type: A flag deciding whether the model is MOJO or POJO. + :param kwargs: kwargs to pass to ``h2o.save_model`` method. + :return: A :py:class:`ModelInfo ` instance that contains the + metadata of the logged model. + """ + import h2o_mlflow_flavors + return Model.log( + artifact_path=artifact_path, + flavor=h2o_mlflow_flavors.h2o_gen_model, + registered_model_name=registered_model_name, + h2o_model=h2o_model, + conda_env=conda_env, + code_paths=code_paths, + signature=signature, + input_example=input_example, + pip_requirements=pip_requirements, + extra_pip_requirements=extra_pip_requirements, + model_type=model_type, + metadata=metadata, + **kwargs, + ) + + +def load_model(model_uri, dst_path=None): + path = _download_artifact_from_uri( + artifact_uri=model_uri, output_path=dst_path + ) + return _load_model(path) + + +def _load_model(path): + flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME) + model_type = flavor_conf["model_type"] + model_path = os.path.join(path, flavor_conf["model_file"]) + genmodel_jar_path = os.path.join(path, "h2o-genmodel.jar") + + return _H2OModelWrapper(flavor_conf["model_file"], model_path, model_type, genmodel_jar_path) + + +class _H2OModelWrapper: + def __init__(self, model_file, model_path, model_type, genmodel_jar_path): + self.model_file = model_file + self.model_path = model_path + self.model_type = model_type + self.genmodel_jar_path = genmodel_jar_path + + def predict(self, dataframe, params=None): + """ + :param dataframe: Model input data. + :param params: Additional parameters to pass to the model for inference. + + :return: Model predictions. 
+ """ + with tempfile.TemporaryDirectory() as tempdir: + input_file = os.path.join(tempdir, "input.csv") + output_file = os.path.join(tempdir, "output.csv") + dataframe.to_csv(input_file) + + if self.model_type == "MOJO": + class_path = self.genmodel_jar_path + type_parameter = "--mojo" + model_artefact = self.model_path + else: + class_path_separator = ";" if sys.platform == "win32" else ":" + class_path = self.genmodel_jar_path + class_path_separator + self.model_path + type_parameter = "--pojo" + model_artefact = self.model_file.replace(".class", "") + + java_cmd = ["java", "-cp", class_path, + "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=2g", + "hex.genmodel.tools.PredictCsv", + "--input", input_file, "--output", output_file, type_parameter, model_artefact, "--decimal"] + ret = subprocess.call(java_cmd) + assert ret == 0, "GenModel finished with return code %d." % ret + predicted = pandas.read_csv(output_file) + predicted.index = dataframe.index + return predicted + + +def _load_pyfunc(path): + """ + Load PyFunc implementation. Called by ``pyfunc.load_model``. + + :param path: Local filesystem path to the MLflow Model with the ``h2o`` flavor. 
+ """ + return _load_model(path) + + From 407c95e285b0ddb4cc98fee1788095805b6cee06 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Fri, 3 Nov 2023 19:30:59 +0100 Subject: [PATCH 12/43] Fix pojo --- .../h2o_mlflow_flavors/h2o_gen_model.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py index c323ec8d6f57..3bc52b9e672d 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -96,8 +96,9 @@ def save_model( model_data_path = h2o_model.download_mojo(path=path, get_genmodel_jar=True) model_file = os.path.basename(model_data_path) else: - model_data_path = h2o_model.downlaod_pojo(path=path, get_genmodel_jar=True) - javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", java_file] + model_data_path = h2o_model.download_pojo(path=path, get_genmodel_jar=True) + h2o_genmodel_jar = os.path.join(path, "h2o-genmodel.jar") + javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", model_data_path] subprocess.check_call(javac_cmd) model_file = os.path.basename(model_data_path).replace(".java", ".class") @@ -236,8 +237,9 @@ def predict(self, dataframe, params=None): with tempfile.TemporaryDirectory() as tempdir: input_file = os.path.join(tempdir, "input.csv") output_file = os.path.join(tempdir, "output.csv") - dataframe.to_csv(input_file) - + separator = "`" + import csv + dataframe.to_csv(input_file, index=False, quoting=csv.QUOTE_NONNUMERIC, sep=separator) if self.model_type == "MOJO": class_path = self.genmodel_jar_path type_parameter = "--mojo" @@ -249,13 +251,17 @@ def predict(self, dataframe, params=None): model_artefact = self.model_file.replace(".class", "") java_cmd = ["java", "-cp", class_path, - "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=2g", - "hex.genmodel.tools.PredictCsv", + "-ea", "-Xmx12g", 
"-XX:ReservedCodeCacheSize=256m", + "hex.genmodel.tools.PredictCsv", "--separator", separator, "--input", input_file, "--output", output_file, type_parameter, model_artefact, "--decimal"] ret = subprocess.call(java_cmd) assert ret == 0, "GenModel finished with return code %d." % ret predicted = pandas.read_csv(output_file) predicted.index = dataframe.index + with open(input_file, "r") as file: + print(file.read()) + with open(output_file, "r") as file: + print(file.read()) return predicted From 83e2420d962202bc8545f9a76267c4c42c3595c9 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 6 Nov 2023 16:06:15 +0100 Subject: [PATCH 13/43] Fix pojo --- .../h2o_mlflow_flavors/h2o_gen_model.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py index 3bc52b9e672d..d5cfd6d2f829 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -98,9 +98,10 @@ def save_model( else: model_data_path = h2o_model.download_pojo(path=path, get_genmodel_jar=True) h2o_genmodel_jar = os.path.join(path, "h2o-genmodel.jar") - javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", model_data_path] + output_path = os.path.join(path, "classes") + javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-d", output_path, "-J-Xmx12g", model_data_path] subprocess.check_call(javac_cmd) - model_file = os.path.basename(model_data_path).replace(".java", ".class") + model_file = os.path.basename(model_data_path).replace(".java", "") pyfunc.add_to_model( mlflow_model, @@ -214,18 +215,17 @@ def load_model(model_uri, dst_path=None): def _load_model(path): flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME) model_type = flavor_conf["model_type"] - model_path = os.path.join(path, flavor_conf["model_file"]) - genmodel_jar_path = os.path.join(path, 
"h2o-genmodel.jar") + model_file = flavor_conf["model_file"] - return _H2OModelWrapper(flavor_conf["model_file"], model_path, model_type, genmodel_jar_path) + return _H2OModelWrapper(model_file, model_type, path) class _H2OModelWrapper: - def __init__(self, model_file, model_path, model_type, genmodel_jar_path): + def __init__(self, model_file, model_type, path): self.model_file = model_file - self.model_path = model_path self.model_type = model_type - self.genmodel_jar_path = genmodel_jar_path + self.path = path + self.genmodel_jar_path = os.path.join(path, "h2o-genmodel.jar") def predict(self, dataframe, params=None): """ @@ -243,10 +243,10 @@ def predict(self, dataframe, params=None): if self.model_type == "MOJO": class_path = self.genmodel_jar_path type_parameter = "--mojo" - model_artefact = self.model_path - else: + model_artefact = os.path.join(self.path, self.model_file) + else: class_path_separator = ";" if sys.platform == "win32" else ":" - class_path = self.genmodel_jar_path + class_path_separator + self.model_path + class_path = self.genmodel_jar_path + class_path_separator + os.path.join(self.path, "classes") type_parameter = "--pojo" model_artefact = self.model_file.replace(".class", "") @@ -258,10 +258,6 @@ def predict(self, dataframe, params=None): assert ret == 0, "GenModel finished with return code %d." 
% ret predicted = pandas.read_csv(output_file) predicted.index = dataframe.index - with open(input_file, "r") as file: - print(file.read()) - with open(output_file, "r") as file: - print(file.read()) return predicted From 01ab6ab8839864b68bc6b7de1196e7d43da79627 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 6 Nov 2023 20:49:46 +0100 Subject: [PATCH 14/43] Add extraction of metrics --- .../h2o_mlflow_flavors/h2o_gen_model.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py index d5cfd6d2f829..1fd0e2e5440d 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -63,6 +63,36 @@ def get_default_conda_env(): return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) +def get_params(h2o_model): + return None + + +def get_metrics(h2o_model, metric_type=None): + def get_metrics_section(output, prefix, metric_type): + is_valid = lambda key, val: isinstance(val,(type(None), bool, float, int)) and not str(key).endswith("checksum") + items = output[metric_type]._metric_json.items() + return {prefix + str(key): val for key, val in items if is_valid(key, val)} + + metric_type_lower = None + if metric_type: + metric_type_lower = metric_type.toLowerCase() + + output = h2o_model._model_json["output"] + metrics = {} + + if output["training_metrics"] and (metric_type_lower is None or metric_type_lower == "training"): + training_metrics = get_metrics_section(output, "training_", "training_metrics") + metrics = dict(metrics, **training_metrics) + if output["validation_metrics"] and (metric_type_lower is None or metric_type_lower == "validation"): + validation_metrics = get_metrics_section(output, "validation_", "validation_metrics") + metrics = dict(metrics, **validation_metrics) + if output["cross_validation_metrics"] and 
(metric_type_lower is None or metric_type_lower in ["cv", "cross_validation"]): + cross_validation_metrics = get_metrics_section(output, "cv_", "cross_validation_metrics") + metrics = dict(metrics, **cross_validation_metrics) + + return metrics + + def save_model( h2o_model, path, From 65499e54ef3d36931d93f77c4e9241e3fb28d2aa Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 7 Nov 2023 18:38:03 +0100 Subject: [PATCH 15/43] Fix metric extraction --- .../h2o_mlflow_flavors/h2o_gen_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py index 1fd0e2e5440d..8417fc661a19 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -64,14 +64,17 @@ def get_default_conda_env(): def get_params(h2o_model): - return None + return h2o_model.actual_params def get_metrics(h2o_model, metric_type=None): def get_metrics_section(output, prefix, metric_type): - is_valid = lambda key, val: isinstance(val,(type(None), bool, float, int)) and not str(key).endswith("checksum") + is_valid = lambda key, val: isinstance(val, (bool, float, int)) and not str(key).endswith("checksum") items = output[metric_type]._metric_json.items() - return {prefix + str(key): val for key, val in items if is_valid(key, val)} + dictionary = dict(items) + if dictionary["custom_metric_name"] is None: + del dictionary["custom_metric_value"] + return {prefix + str(key): val for key, val in dictionary.items() if is_valid(key, val)} metric_type_lower = None if metric_type: From bf5e77a8258a0838c008624aa5b67213168f0243 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Wed, 8 Nov 2023 17:40:17 +0100 Subject: [PATCH 16/43] add input examples --- .../h2o_mlflow_flavors/h2o_gen_model.py | 131 ++++++++++++++---- 1 file changed, 102 insertions(+), 29 deletions(-) diff --git 
a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py index 8417fc661a19..a72ba06558ff 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py @@ -29,7 +29,6 @@ _PythonEnv, _validate_env_arguments, ) - from mlflow.utils.file_utils import write_to from mlflow.utils.model_utils import ( _add_code_from_conf_to_system_path, @@ -39,7 +38,7 @@ ) from mlflow.utils.requirements_utils import _get_pinned_requirement from mlflow.tracking.artifact_utils import _download_artifact_from_uri - +from mlflow.models.signature import _infer_signature_from_input_example _logger = logging.getLogger(__name__) @@ -64,10 +63,29 @@ def get_default_conda_env(): def get_params(h2o_model): - return h2o_model.actual_params - + """ + Extracts training parameters for the H2O binary model. + :param h2o_model: An H2O binary model. + :return: A dictionary of parameters that were used for training the model. + """ + def is_valid(key): + return key != "model_id" and \ + not key.endswith("_frame") and \ + not key.startswith("keep_cross_validation_") + + return {key: val for key, val in h2o_model.actual_params.items() if is_valid(key)} + + def get_metrics(h2o_model, metric_type=None): + """ + Extracts metrics from the H2O binary model. + + :param h2o_model: An H2O binary model. + :param metric_type: The type of metrics. Possible values are "training", "validation", "cross_validation". + If parameter is not specified, metrics for all types are returned. + :return: A dictionary of model metrics. 
+ """ def get_metrics_section(output, prefix, metric_type): is_valid = lambda key, val: isinstance(val, (bool, float, int)) and not str(key).endswith("checksum") items = output[metric_type]._metric_json.items() @@ -78,7 +96,7 @@ def get_metrics_section(output, prefix, metric_type): metric_type_lower = None if metric_type: - metric_type_lower = metric_type.toLowerCase() + metric_type_lower = metric_type.toLowerCase() output = h2o_model._model_json["output"] metrics = {} @@ -89,13 +107,36 @@ def get_metrics_section(output, prefix, metric_type): if output["validation_metrics"] and (metric_type_lower is None or metric_type_lower == "validation"): validation_metrics = get_metrics_section(output, "validation_", "validation_metrics") metrics = dict(metrics, **validation_metrics) - if output["cross_validation_metrics"] and (metric_type_lower is None or metric_type_lower in ["cv", "cross_validation"]): + if output["cross_validation_metrics"] and ( + metric_type_lower is None or metric_type_lower in ["cv", "cross_validation"]): cross_validation_metrics = get_metrics_section(output, "cv_", "cross_validation_metrics") metrics = dict(metrics, **cross_validation_metrics) - + return metrics +def get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True): + """ + Creates an example Pandas dataset from the training dataset of H2O binary model. + + :param h2o_model: An H2O binary model. + :param number_of_records: A number of records that will be extracted from the training dataset. + :param relevant_columns_only: A flag indicating whether the output dataset should contain + only columns required by the model. Defaults to ``True``. 
+ :return: + """ + + import h2o + frame = h2o.get_frame(h2o_model.actual_params["training_frame"]).head(number_of_records) + result = frame.as_data_frame() + if relevant_columns_only: + relevant_columns = h2o_model.varimp(use_pandas=True)["variable"].values.tolist() + input_columns = [col for col in frame.col_names if col in relevant_columns] + return result[input_columns] + else: + return result + + def save_model( h2o_model, path, @@ -106,25 +147,40 @@ def save_model( input_example=None, pip_requirements=None, extra_pip_requirements=None, - model_type="MOJO" + model_type="MOJO", + extra_prediction_args=[] ): - import h2o + """ + Saves an H2O binary model to a path on the local file system in MOJO or POJO format. + + :param h2o_model: H2O binary model to be saved to MOJO or POJO. + :param path: Local path where the model is to be saved. + :param conda_env: {{ conda_env }} + :param code_paths: A list of local filesystem paths to Python file dependencies (or directories + containing file dependencies). These files are *prepended* to the system + path when the model is loaded. + :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. + :param signature: {{ signature }} + :param input_example: {{ input_example }} + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + :param model_type: A flag deciding whether the model is MOJO or POJO. + :param extra_prediction_args: A list of extra arguments for java predictions process. Possible values: + --setConvertInvalidNum - Converts invalid numbers to NA + --predictContributions - Returns also Shapley values a long with the predictions + --predictCalibrated - Return also calibrated prediction values. + """ + + import h2o model_type_upper = model_type.upper() if model_type_upper != "MOJO" and model_type_upper != "POJO": raise ValueError(f"The `model_type` parameter must be 'MOJO' or 'POJO'. 
The passed value was '{model_type}'.") - + _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) _validate_and_prepare_target_save_path(path) code_dir_subpath = _validate_and_copy_code_paths(code_paths, path) - if mlflow_model is None: - mlflow_model = Model() - if signature is not None: - mlflow_model.signature = signature - if input_example is not None: - _save_example(mlflow_model, input_example, path) - if model_type_upper == "MOJO": model_data_path = h2o_model.download_mojo(path=path, get_genmodel_jar=True) model_file = os.path.basename(model_data_path) @@ -135,7 +191,22 @@ def save_model( javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-d", output_path, "-J-Xmx12g", model_data_path] subprocess.check_call(javac_cmd) model_file = os.path.basename(model_data_path).replace(".java", "") + + if signature is None and input_example is not None: + wrapped_model = _H2OModelWrapper(model_file, model_type, path, extra_prediction_args) + signature = _infer_signature_from_input_example(input_example, wrapped_model) + elif signature is False: + signature = None + if mlflow_model is None: + mlflow_model = Model() + if signature is not None: + mlflow_model.signature = signature + if input_example is not None: + _save_example(mlflow_model, input_example, path) + if metadata is not None: + mlflow_model.metadata = metadata + pyfunc.add_to_model( mlflow_model, loader_module="h2o_mlflow_flavors.h2o_gen_model", @@ -149,6 +220,7 @@ def save_model( FLAVOR_NAME, model_file=model_file, model_type=model_type_upper, + extra_prediction_args=extra_prediction_args, h2o_version=h2o.__version__, code=code_dir_subpath, ) @@ -190,12 +262,12 @@ def log_model( input_example: ModelInputExample = None, pip_requirements=None, extra_pip_requirements=None, - metadata=None, model_type="MOJO", + extra_prediction_args=[], **kwargs, ): """ - Log an H2O model as an MLflow artifact for the current run. + Logs an H2O model as an MLflow artifact for the current run. 
:param h2o_model: H2O model to be saved. :param artifact_path: Run-relative artifact path. @@ -211,11 +283,11 @@ def log_model( :param input_example: {{ input_example }} :param pip_requirements: {{ pip_requirements }} :param extra_pip_requirements: {{ extra_pip_requirements }} - :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. - - .. Note:: Experimental: This parameter may change or be removed in a future - release without warning. - :param model_type: A flag deciding whether the model is MOJO or POJO. + :param model_type: A flag deciding whether the model is MOJO or POJO. + :param extra_prediction_args: A list of extra arguments for java predictions process. Possible values: + --setConvertInvalidNum - Converts invalid numbers to NA + --predictContributions - Returns also Shapley values a long with the predictions + --predictCalibrated - Return also calibrated prediction values. :param kwargs: kwargs to pass to ``h2o.save_model`` method. :return: A :py:class:`ModelInfo ` instance that contains the metadata of the logged model. 
@@ -234,6 +306,7 @@ def log_model( extra_pip_requirements=extra_pip_requirements, model_type=model_type, metadata=metadata, + extra_prediction_args=extra_prediction_args, **kwargs, ) @@ -249,15 +322,16 @@ def _load_model(path): flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME) model_type = flavor_conf["model_type"] model_file = flavor_conf["model_file"] - - return _H2OModelWrapper(model_file, model_type, path) + extra_prediction_args = flavor_conf["extra_prediction_args"] + return _H2OModelWrapper(model_file, model_type, path, extra_prediction_args) class _H2OModelWrapper: - def __init__(self, model_file, model_type, path): + def __init__(self, model_file, model_type, path, extra_prediction_args): self.model_file = model_file self.model_type = model_type self.path = path + self.extra_prediction_args = extra_prediction_args if extra_prediction_args is not None else [] self.genmodel_jar_path = os.path.join(path, "h2o-genmodel.jar") def predict(self, dataframe, params=None): @@ -287,6 +361,7 @@ def predict(self, dataframe, params=None): "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--separator", separator, "--input", input_file, "--output", output_file, type_parameter, model_artefact, "--decimal"] + java_cmd += self.extra_prediction_args ret = subprocess.call(java_cmd) assert ret == 0, "GenModel finished with return code %d." % ret predicted = pandas.read_csv(output_file) @@ -301,5 +376,3 @@ def _load_pyfunc(path): :param path: Local filesystem path to the MLflow Model with the ``h2o`` flavor. 
""" return _load_model(path) - - From d89754832c72ba9d2f76afd1a09e8383a364f093 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Fri, 10 Nov 2023 09:27:20 +0100 Subject: [PATCH 17/43] moved mlflow-flavor --- build.gradle | 2 +- .../.gitignore | 0 h2o-py-mlflow-flavor/DESCRIPTION.rst | 4 + h2o-py-mlflow-flavor/README.md | 3 + .../build.gradle | 12 +- .../h2o_mlflow_flavor/__init__.py | 60 +++-- .../setup.cfg | 2 +- .../setup.py | 12 +- h2o-py-mlflow-flavors/DESCRIPTION.rst | 4 - h2o-py-mlflow-flavors/README.md | 3 - .../h2o_mlflow_flavors/__init__.py | 0 .../h2o_mlflow_flavors/h2o_mojo.py | 234 ------------------ settings.gradle | 2 +- 13 files changed, 58 insertions(+), 280 deletions(-) rename {h2o-py-mlflow-flavors => h2o-py-mlflow-flavor}/.gitignore (100%) create mode 100644 h2o-py-mlflow-flavor/DESCRIPTION.rst create mode 100644 h2o-py-mlflow-flavor/README.md rename {h2o-py-mlflow-flavors => h2o-py-mlflow-flavor}/build.gradle (89%) rename h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py => h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py (90%) rename {h2o-py-mlflow-flavors => h2o-py-mlflow-flavor}/setup.cfg (95%) rename {h2o-py-mlflow-flavors => h2o-py-mlflow-flavor}/setup.py (86%) delete mode 100644 h2o-py-mlflow-flavors/DESCRIPTION.rst delete mode 100644 h2o-py-mlflow-flavors/README.md delete mode 100644 h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py delete mode 100644 h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py diff --git a/build.gradle b/build.gradle index 59cf91e57428..9f34251638c6 100644 --- a/build.gradle +++ b/build.gradle @@ -156,7 +156,7 @@ ext { pythonProjects = [ project(':h2o-py'), project(':h2o-py-cloud-extensions'), - project(':h2o-py-mlflow-flavors') + project(':h2o-py-mlflow-flavor') ] // The project which need to be run under CI only diff --git a/h2o-py-mlflow-flavors/.gitignore b/h2o-py-mlflow-flavor/.gitignore similarity index 100% rename from h2o-py-mlflow-flavors/.gitignore rename to 
h2o-py-mlflow-flavor/.gitignore diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst new file mode 100644 index 000000000000..5969272ac94f --- /dev/null +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -0,0 +1,4 @@ +H2O-3 MLFlow Flavor +=================== + +A tiny library containing MLFlow flavor for working with H2O-3 MOJO and POJO models. diff --git a/h2o-py-mlflow-flavor/README.md b/h2o-py-mlflow-flavor/README.md new file mode 100644 index 000000000000..2b2ae69f20ab --- /dev/null +++ b/h2o-py-mlflow-flavor/README.md @@ -0,0 +1,3 @@ +# H2O-3 MLFlow Flavor + +A tiny library containing [MLFlow](https://mlflow.org/) flavors for working with H2O-3 MOJO and POJO models. diff --git a/h2o-py-mlflow-flavors/build.gradle b/h2o-py-mlflow-flavor/build.gradle similarity index 89% rename from h2o-py-mlflow-flavors/build.gradle rename to h2o-py-mlflow-flavor/build.gradle index 8b374cdbcd50..58dcd1d243d2 100644 --- a/h2o-py-mlflow-flavors/build.gradle +++ b/h2o-py-mlflow-flavor/build.gradle @@ -1,4 +1,4 @@ -description = "H2O-3 MLFlow Flavors" +description = "H2O-3 MLFlow Flavor" dependencies {} @@ -20,11 +20,11 @@ ext { // task createVersionFiles() { doLast { - file("${buildDir}/h2o_mlflow_flavors/").mkdirs() - File version_file = new File("${buildDir}/h2o_mlflow_flavors/", "version.txt") + file("${buildDir}/h2o_mlflow_flavor/").mkdirs() + File version_file = new File("${buildDir}/h2o_mlflow_flavor/", "version.txt") version_file.write(PROJECT_VERSION) - File build_file = new File("${buildDir}/h2o_mlflow_flavors/", "buildinfo.txt") + File build_file = new File("${buildDir}/h2o_mlflow_flavor/", "buildinfo.txt") build_file.write(buildVersion.toString()) } } @@ -34,7 +34,7 @@ task copySrcFiles(type: Copy) { from ("${projectDir}") { include "setup.py" include "setup.cfg" - include "h2o_mlflow_flavors/**" + include "h2o_mlflow_flavor/**" include "README.md" include "DESCRIPTION.rst" } @@ -45,7 +45,7 @@ task buildDist(type: Exec, dependsOn: 
[createVersionFiles, copySrcFiles]) { workingDir buildDir doFirst { file("${buildDir}/tmp").mkdirs() - standardOutput = new FileOutputStream(file("${buildDir}/tmp/h2o_mlflow_flavors_buildDist.out")) + standardOutput = new FileOutputStream(file("${buildDir}/tmp/h2o_mlflow_flavor_buildDist.out")) } commandLine getOsSpecificCommandLine([pythonexe, "setup.py", "bdist_wheel"]) } diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py similarity index 90% rename from h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py rename to h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py index a72ba06558ff..7321515ccb2d 100644 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_gen_model.py +++ b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py @@ -1,5 +1,5 @@ """ -The `h2o_mlflow_flavors.h2o_gen_model` module provides an API for working with H2O MOJO and POJO models. +The `h2o_mlflow_flavor` module provides an API for working with H2O MOJO and POJO models. 
""" import logging @@ -16,8 +16,8 @@ from mlflow.models import Model from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.utils import _save_example -from mlflow.models.signature import ModelSignature -from mlflow.models.utils import ModelInputExample +from mlflow.models import ModelSignature, ModelInputExample +from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema, DataType from mlflow.utils.environment import ( _CONDA_ENV_FILE_NAME, _CONSTRAINTS_FILE_NAME, @@ -31,14 +31,12 @@ ) from mlflow.utils.file_utils import write_to from mlflow.utils.model_utils import ( - _add_code_from_conf_to_system_path, _get_flavor_configuration, _validate_and_copy_code_paths, _validate_and_prepare_target_save_path, ) -from mlflow.utils.requirements_utils import _get_pinned_requirement from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.models.signature import _infer_signature_from_input_example +from mlflow.types.utils import _infer_pandas_column _logger = logging.getLogger(__name__) @@ -68,11 +66,11 @@ def get_params(h2o_model): :param h2o_model: An H2O binary model. :return: A dictionary of parameters that were used for training the model. - """ - def is_valid(key): + """ + def is_valid(key): return key != "model_id" and \ - not key.endswith("_frame") and \ - not key.startswith("keep_cross_validation_") + not key.endswith("_frame") and \ + not key.startswith("keep_cross_validation_") return {key: val for key, val in h2o_model.actual_params.items() if is_valid(key)} @@ -123,9 +121,9 @@ def get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True :param number_of_records: A number of records that will be extracted from the training dataset. :param relevant_columns_only: A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. 
- :return: + :return: Pandas dataset made from the training dataset of H2O binary model """ - + import h2o frame = h2o.get_frame(h2o_model.actual_params["training_frame"]).head(number_of_records) result = frame.as_data_frame() @@ -133,10 +131,29 @@ def get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True relevant_columns = h2o_model.varimp(use_pandas=True)["variable"].values.tolist() input_columns = [col for col in frame.col_names if col in relevant_columns] return result[input_columns] - else: + else: return result +def _infer_signature(h2o_model, wrapped_model, input_example): + + input_schema = _get_input_schema(h2o_model) + prediction = wrapped_model.predict(input_example) + output_schema = Schema( + [ColSpec(type=_infer_pandas_column(prediction[col]), name=col) for col in prediction.columns] + ) + return ModelSignature(inputs=input_schema, outputs=output_schema) + + +def _get_input_schema(h2o_model): + import h2o + training_frame = h2o.get_frame(h2o_model.actual_params["training_frame"]) + relevant_columns = h2o_model.varimp(use_pandas=True)["variable"].values.tolist() + input_columns = [ColSpec(name=key, type=DataType.string) + for key, val in training_frame.types.items() + if key in relevant_columns] + return Schema(input_columns) + def save_model( h2o_model, path, @@ -170,8 +187,8 @@ def save_model( --setConvertInvalidNum - Converts invalid numbers to NA --predictContributions - Returns also Shapley values a long with the predictions --predictCalibrated - Return also calibrated prediction values. 
- """ - + """ + import h2o model_type_upper = model_type.upper() if model_type_upper != "MOJO" and model_type_upper != "POJO": @@ -194,22 +211,20 @@ def save_model( if signature is None and input_example is not None: wrapped_model = _H2OModelWrapper(model_file, model_type, path, extra_prediction_args) - signature = _infer_signature_from_input_example(input_example, wrapped_model) + signature = _infer_signature(h2o_model, wrapped_model, input_example) elif signature is False: signature = None - + if mlflow_model is None: mlflow_model = Model() if signature is not None: mlflow_model.signature = signature if input_example is not None: _save_example(mlflow_model, input_example, path) - if metadata is not None: - mlflow_model.metadata = metadata pyfunc.add_to_model( mlflow_model, - loader_module="h2o_mlflow_flavors.h2o_gen_model", + loader_module="h2o_mlflow_flavor", model_path=model_file, conda_env=_CONDA_ENV_FILE_NAME, python_env=_PYTHON_ENV_FILE_NAME, @@ -292,10 +307,10 @@ def log_model( :return: A :py:class:`ModelInfo ` instance that contains the metadata of the logged model. 
""" - import h2o_mlflow_flavors + import h2o_mlflow_flavor return Model.log( artifact_path=artifact_path, - flavor=h2o_mlflow_flavors.h2o_gen_model, + flavor=h2o_mlflow_flavor, registered_model_name=registered_model_name, h2o_model=h2o_model, conda_env=conda_env, @@ -305,7 +320,6 @@ def log_model( pip_requirements=pip_requirements, extra_pip_requirements=extra_pip_requirements, model_type=model_type, - metadata=metadata, extra_prediction_args=extra_prediction_args, **kwargs, ) diff --git a/h2o-py-mlflow-flavors/setup.cfg b/h2o-py-mlflow-flavor/setup.cfg similarity index 95% rename from h2o-py-mlflow-flavors/setup.cfg rename to h2o-py-mlflow-flavor/setup.cfg index 29a557ba78a8..a986ae5a563b 100644 --- a/h2o-py-mlflow-flavors/setup.cfg +++ b/h2o-py-mlflow-flavor/setup.cfg @@ -12,7 +12,7 @@ # ignore = E241,E265,E302,E303,E701,D105 max-line-length = 120 -application-import-names = h2o_mlflow_flavors +application-import-names = h2o_mlflow_flavor import-order-style = smarkets inline-quotes = " diff --git a/h2o-py-mlflow-flavors/setup.py b/h2o-py-mlflow-flavor/setup.py similarity index 86% rename from h2o-py-mlflow-flavors/setup.py rename to h2o-py-mlflow-flavor/setup.py index e274594e3110..aeb84e7c8d50 100644 --- a/h2o-py-mlflow-flavors/setup.py +++ b/h2o-py-mlflow-flavor/setup.py @@ -2,8 +2,6 @@ from setuptools import setup, find_packages from codecs import open import os -import sys -import shutil here = os.path.abspath(os.path.dirname(__file__)) @@ -11,23 +9,23 @@ with open(os.path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f: long_description = f.read() -version = "0.0.local" +version = "0.1.0-SNAPSHOT" # Get the version from the relevant file -with open(os.path.join(here, 'h2o_mlflow_flavors/version.txt'), encoding='utf-8') as f: +with open(os.path.join(here, 'h2o_mlflow_flavor/version.txt'), encoding='utf-8') as f: version = f.read() packages = find_packages(exclude=["tests*"]) print("Found packages: %r" % packages) setup( - name='h2o_mlflow_flavors', + 
name='h2o_mlflow_flavor', # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html version = version, - description='Collection of extensions for integration of H2O-3 with H2O.ai Cloud', + description='A mlflow flavor for working with H2O-3 MOJO and POJO models', long_description=long_description, # The project's main homepage. @@ -68,7 +66,7 @@ "Programming Language :: Python :: 3.11", ], - keywords='machine learning, data mining, statistical analysis, modeling, big data, distributed, parallel', + keywords='ML Flow, H2O-3', packages=packages, package_data={"h2o": [ diff --git a/h2o-py-mlflow-flavors/DESCRIPTION.rst b/h2o-py-mlflow-flavors/DESCRIPTION.rst deleted file mode 100644 index 5831785a3014..000000000000 --- a/h2o-py-mlflow-flavors/DESCRIPTION.rst +++ /dev/null @@ -1,4 +0,0 @@ -H2O-3 MLFlow Flavors -==================== - -A tiny library containing MLFlow flavors for H2O-3 MOJO and POJO models. diff --git a/h2o-py-mlflow-flavors/README.md b/h2o-py-mlflow-flavors/README.md deleted file mode 100644 index cd14c59f0b5e..000000000000 --- a/h2o-py-mlflow-flavors/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# H2O-3 MLFlow Flavors - -A tiny library containing [MLFlow](https://mlflow.org/) flavors for H2O-3 MOJO and POJO models. diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py b/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py deleted file mode 100644 index d1314e32511b..000000000000 --- a/h2o-py-mlflow-flavors/h2o_mlflow_flavors/h2o_mojo.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -The `h2o_mlflow_flavors.h2o_mojo` module provides an API for working with H2O MOJO models. 
-h2o3 - -""" - -import logging -import os - -import yaml - -import mlflow -from mlflow import pyfunc -from mlflow.models import Model -from mlflow.models.model import MLMODEL_FILE_NAME -from mlflow.models.utils import _save_example -from mlflow.models.signature import ModelSignature -from mlflow.models.utils import ModelInputExample -from mlflow.utils.environment import ( - _CONDA_ENV_FILE_NAME, - _CONSTRAINTS_FILE_NAME, - _PYTHON_ENV_FILE_NAME, - _REQUIREMENTS_FILE_NAME, - _mlflow_conda_env, - _process_conda_env, - _process_pip_requirements, - _PythonEnv, - _validate_env_arguments, -) - -from mlflow.utils.file_utils import write_to -from mlflow.utils.model_utils import ( - _add_code_from_conf_to_system_path, - _get_flavor_configuration, - _validate_and_copy_code_paths, - _validate_and_prepare_target_save_path, -) -from mlflow.utils.requirements_utils import _get_pinned_requirement -from mlflow.tracking.artifact_utils import _download_artifact_from_uri - - -_logger = logging.getLogger(__name__) - -FLAVOR_NAME = "h2o_mojo" - -def get_default_pip_requirements(): - """ - :return: A list of default pip requirements for MLflow Models produced by this flavor. - Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment - that, at minimum, contains these requirements. - """ - return [_get_pinned_requirement("h2o")] - - -def get_default_conda_env(): - """ - :return: The default Conda environment for MLflow Models produced by calls to - :func:`save_model()` and :func:`log_model()`. 
- """ - return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) - -def save_model( - h2o_model, - path, - conda_env=None, - code_paths=None, - mlflow_model=None, - signature=None, - input_example=None, - pip_requirements=None, - extra_pip_requirements=None, -): - import h2o - - _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) - _validate_and_prepare_target_save_path(path) - code_dir_subpath = _validate_and_copy_code_paths(code_paths, path) - - if mlflow_model is None: - mlflow_model = Model() - if signature is not None: - mlflow_model.signature = signature - if input_example is not None: - _save_example(mlflow_model, input_example, path) - - model_data_path = h2o_model.download_mojo(path=path, get_genmodel_jar=True) - model_file = os.path.basename(model_data_path) - - pyfunc.add_to_model( - mlflow_model, - loader_module="h2o_mlflow_flavors.h2o_mojo", - model_path=model_file, - conda_env=_CONDA_ENV_FILE_NAME, - python_env=_PYTHON_ENV_FILE_NAME, - code=code_dir_subpath, - ) - - mlflow_model.add_flavor( - FLAVOR_NAME, - model_file=model_file, - h2o_version=h2o.__version__, - code=code_dir_subpath, - ) - mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) - - if conda_env is None: - if pip_requirements is None: - default_reqs = get_default_pip_requirements() - inferred_reqs = mlflow.models.infer_pip_requirements( - path, FLAVOR_NAME, fallback=default_reqs - ) - default_reqs = sorted(set(inferred_reqs).union(default_reqs)) - else: - default_reqs = None - conda_env, pip_requirements, pip_constraints = _process_pip_requirements( - default_reqs, pip_requirements, extra_pip_requirements - ) - else: - conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) - - with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: - yaml.safe_dump(conda_env, stream=f, default_flow_style=False) - - if pip_constraints: - write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) - - 
write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) - - _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME)) - - -def log_model( - h2o_model, - artifact_path, - conda_env=None, - code_paths=None, - registered_model_name=None, - signature: ModelSignature = None, - input_example: ModelInputExample = None, - pip_requirements=None, - extra_pip_requirements=None, - metadata=None, - **kwargs, -): - """ - Log an H2O model as an MLflow artifact for the current run. - - :param h2o_model: H2O model to be saved. - :param artifact_path: Run-relative artifact path. - :param conda_env: {{ conda_env }} - :param code_paths: A list of local filesystem paths to Python file dependencies (or directories - containing file dependencies). These files are *prepended* to the system - path when the model is loaded. - :param registered_model_name: If given, create a model version under - ``registered_model_name``, also creating a registered model if one - with the given name does not exist. - - :param signature: {{ signature }} - :param input_example: {{ input_example }} - :param pip_requirements: {{ pip_requirements }} - :param extra_pip_requirements: {{ extra_pip_requirements }} - :param metadata: Custom metadata dictionary passed to the model and stored in the MLmodel file. - - .. Note:: Experimental: This parameter may change or be removed in a future - release without warning. - :param kwargs: kwargs to pass to ``h2o.save_model`` method. - :return: A :py:class:`ModelInfo ` instance that contains the - metadata of the logged model. 
- """ - import h2o_mlflow_flavors - return Model.log( - artifact_path=artifact_path, - flavor=h2o_mlflow_flavors.h2o_mojo, - registered_model_name=registered_model_name, - h2o_model=h2o_model, - conda_env=conda_env, - code_paths=code_paths, - signature=signature, - input_example=input_example, - pip_requirements=pip_requirements, - extra_pip_requirements=extra_pip_requirements, - metadata=metadata, - **kwargs, - ) - - -def load_model(model_uri, dst_path=None): - path = _download_artifact_from_uri( - artifact_uri=model_uri, output_path=dst_path - ) - return _H2OModelWrapper(_load_model(path, init=True)) - - -def _load_model(path, init=False): - import h2o - - if init: - h2o.init(strict_version_check=False) - h2o.no_progress() - - flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME) - model_path = os.path.join(path, flavor_conf["model_file"]) - model = h2o.import_mojo(model_path) - - return model - - -class _H2OModelWrapper: - def __init__(self, h2o_model): - self.h2o_model = h2o_model - - def predict(self, dataframe, params=None): - """ - :param dataframe: Model input data. - :param params: Additional parameters to pass to the model for inference. - - :return: Model predictions. - """ - import h2o - - predicted = self.h2o_model.predict(h2o.H2OFrame(dataframe)).as_data_frame() - predicted.index = dataframe.index - return predicted - - -def _load_pyfunc(path): - """ - Load PyFunc implementation. Called by ``pyfunc.load_model``. - - :param path: Local filesystem path to the MLflow Model with the ``h2o`` flavor. 
- """ - return _H2OModelWrapper(_load_model(path, init=True)) - - diff --git a/settings.gradle b/settings.gradle index fcb368884731..e1d575b534f9 100644 --- a/settings.gradle +++ b/settings.gradle @@ -10,7 +10,7 @@ include 'h2o-app' include 'h2o-r' include 'h2o-py' include 'h2o-py-cloud-extensions' -include 'h2o-py-mlflow-flavors' +include 'h2o-py-mlflow-flavor' include 'h2o-assemblies:main' include 'h2o-assemblies:minimal' include 'h2o-assemblies:steam' From 07443e60d8d66e36aeb8a889f8790ecf669837e3 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Fri, 10 Nov 2023 18:20:17 +0100 Subject: [PATCH 18/43] Add examples --- h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb | 125 ++++++++++++++++++ .../examples/KMeans_pojo.ipynb | 121 +++++++++++++++++ .../h2o_mlflow_flavor/__init__.py | 47 +++---- 3 files changed, 262 insertions(+), 31 deletions(-) create mode 100644 h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb create mode 100644 h2o-py-mlflow-flavor/examples/KMeans_pojo.ipynb diff --git a/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb new file mode 100644 index 000000000000..7327c94f9a0d --- /dev/null +++ b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3ded5553", + "metadata": {}, + "outputs": [], + "source": [ + "# Start H2O-3 runtime.\n", + "\n", + "import h2o\n", + "h2o.init(strict_version_check=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e746ad4", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure DRF algorithm and train a model.\n", + "\n", + "from h2o.estimators import H2ORandomForestEstimator\n", + "\n", + "# Import the cars dataset into H2O:\n", + "cars = h2o.import_file(\"https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv\")\n", + "\n", + "# Set the predictors and response;\n", + "# set the response as a factor:\n", + "cars[\"economy_20mpg\"] = 
cars[\"economy_20mpg\"].asfactor()\n", + "predictors = [\"displacement\",\"power\",\"weight\",\"acceleration\",\"year\"]\n", + "response = \"economy_20mpg\"\n", + "\n", + "# Split the dataset into a train and valid set:\n", + "train, valid = cars.split_frame(ratios=[.8], seed=1234)\n", + "drf = H2ORandomForestEstimator(ntrees=10,\n", + " max_depth=5,\n", + " min_rows=10,\n", + " calibrate_model=True,\n", + " calibration_frame=valid,\n", + " binomial_double_trees=True)\n", + "drf.train(x=predictors,\n", + " y=response,\n", + " training_frame=train,\n", + " validation_frame=valid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29eb0722", + "metadata": {}, + "outputs": [], + "source": [ + "# Log the model to an MLFlow reqistry.\n", + "\n", + "import mlflow\n", + "import h2o_mlflow_flavor\n", + "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n", + "\n", + "with mlflow.start_run(run_name=\"cars\") as run:\n", + " mlflow.log_params(h2o_mlflow_flavor.get_params(drf)) # Log training parameters of the model (optional).\n", + " mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(drf)) # Log performance matrics of the model (optional).\n", + " input_example = h2o_mlflow_flavor.get_input_example(drf) # Extract input example from training dataset (optional)\n", + " h2o_mlflow_flavor.log_model(drf, \"cars\", input_example=input_example,\n", + " model_type=\"MOJO\", # Specify whether the output model should be MOJO or POJO. (MOJO is default)\n", + " extra_prediction_args=[\"--predictCalibrated\"]) # Add extra prediction args if needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bed1dafe", + "metadata": {}, + "outputs": [], + "source": [ + "# Load model from the MLFlow registry and score with the model.\n", + "\n", + "import mlflow\n", + "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n", + "\n", + "logged_model = 'runs:/a9ff364f07fa499eb44e7c49e47fab11/cars' # Specify correct id of your run.\n", + "\n", + "# Load model as a PyFuncModel.\n", + "loaded_model = mlflow.pyfunc.load_model(logged_model)\n", + "\n", + "# Predict on a Pandas DataFrame.\n", + "import pandas as pd\n", + "data = pd.read_csv(\"https://s3.amazonaws.com/h2o-public-test-data/smalldata/junit/cars_20mpg.csv\")\n", + "loaded_model.predict(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "905b0c4c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlflow", + "language": "python", + "name": "mlflow" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/h2o-py-mlflow-flavor/examples/KMeans_pojo.ipynb b/h2o-py-mlflow-flavor/examples/KMeans_pojo.ipynb new file mode 100644 index 000000000000..e83f909b085b --- /dev/null +++ b/h2o-py-mlflow-flavor/examples/KMeans_pojo.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3ded5553", + "metadata": {}, + "outputs": [], + "source": [ + "# Start H2O-3 runtime.\n", + "\n", + "import h2o\n", + "h2o.init(strict_version_check=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e746ad4", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure K-Means algorithm and train a model.\n", + "\n", + "from h2o.estimators import 
H2OKMeansEstimator\n", + "\n", + "# Import the iris dataset into H2O:\n", + "iris = h2o.import_file(\"http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv\")\n", + "\n", + "# Set the predictors:\n", + "predictors = [\"sepal_len\", \"sepal_wid\", \"petal_len\", \"petal_wid\"]\n", + "\n", + "# Split the dataset into a train and valid set:\n", + "train, valid = iris.split_frame(ratios=[.8], seed=1234)\n", + "\n", + "# Build and train the model:\n", + "kmeans = H2OKMeansEstimator(k=10,\n", + " estimate_k=True,\n", + " standardize=False,\n", + " seed=1234)\n", + "kmeans.train(x=predictors,\n", + " training_frame=train,\n", + " validation_frame=valid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29eb0722", + "metadata": {}, + "outputs": [], + "source": [ + "# Log the model to an MLFlow reqistry.\n", + "\n", + "import mlflow\n", + "import h2o_mlflow_flavor\n", + "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n", + "\n", + "with mlflow.start_run(run_name=\"iris\") as run:\n", + " mlflow.log_params(h2o_mlflow_flavor.get_params(kmeans)) # Log training parameters of the model (optional).\n", + " mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(kmeans)) # Log performance matrics of the model (optional).\n", + " input_example = h2o_mlflow_flavor.get_input_example(kmeans) # Extract input example from training dataset (optional)\n", + " h2o_mlflow_flavor.log_model(kmeans, \"iris\", input_example=input_example,\n", + " model_type=\"POJO\") # Specify whether the output model should be MOJO or POJO. 
(MOJO is default)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bed1dafe", + "metadata": {}, + "outputs": [], + "source": [ + "# Load model from the MLFlow registry and score with the model.\n", + "\n", + "import mlflow\n", + "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n", + "\n", + "logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' # Specify correct id of your run.\n", + "\n", + "# Load model as a PyFuncModel.\n", + "loaded_model = mlflow.pyfunc.load_model(logged_model)\n", + "\n", + "# Predict on a Pandas DataFrame.\n", + "import pandas as pd\n", + "data = pd.read_csv(\"http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv\")\n", + "loaded_model.predict(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "905b0c4c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mlflow", + "language": "python", + "name": "mlflow" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py index 7321515ccb2d..21323b8041d9 100644 --- a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py +++ b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py @@ -17,7 +17,6 @@ from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.utils import _save_example from mlflow.models import ModelSignature, ModelInputExample -from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema, DataType from mlflow.utils.environment import ( _CONDA_ENV_FILE_NAME, _CONSTRAINTS_FILE_NAME, @@ -36,11 +35,10 @@ _validate_and_prepare_target_save_path, ) from 
mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.types.utils import _infer_pandas_column _logger = logging.getLogger(__name__) -FLAVOR_NAME = "h2o_gen_model" +FLAVOR_NAME = "h2o_mojo_pojo" def get_default_pip_requirements(): @@ -128,31 +126,20 @@ def get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True frame = h2o.get_frame(h2o_model.actual_params["training_frame"]).head(number_of_records) result = frame.as_data_frame() if relevant_columns_only: - relevant_columns = h2o_model.varimp(use_pandas=True)["variable"].values.tolist() + relevant_columns = _get_relevant_columns(h2o_model) input_columns = [col for col in frame.col_names if col in relevant_columns] return result[input_columns] else: return result -def _infer_signature(h2o_model, wrapped_model, input_example): - - input_schema = _get_input_schema(h2o_model) - prediction = wrapped_model.predict(input_example) - output_schema = Schema( - [ColSpec(type=_infer_pandas_column(prediction[col]), name=col) for col in prediction.columns] - ) - return ModelSignature(inputs=input_schema, outputs=output_schema) - - -def _get_input_schema(h2o_model): - import h2o - training_frame = h2o.get_frame(h2o_model.actual_params["training_frame"]) - relevant_columns = h2o_model.varimp(use_pandas=True)["variable"].values.tolist() - input_columns = [ColSpec(name=key, type=DataType.string) - for key, val in training_frame.types.items() - if key in relevant_columns] - return Schema(input_columns) +def _get_relevant_columns(h2o_model): + names = h2o_model._model_json["output"]["original_names"] or h2o_model._model_json["output"]["names"] + response_column = h2o_model.actual_params.get("response_column") + ignored_columns = h2o_model.actual_params.get("ignored_columns") or [] + irrelevant_columns = ignored_columns + [response_column] if response_column else ignored_columns + relevant_columns = [feature for feature in names if feature not in irrelevant_columns] + return relevant_columns 
def save_model( h2o_model, @@ -209,12 +196,6 @@ def save_model( subprocess.check_call(javac_cmd) model_file = os.path.basename(model_data_path).replace(".java", "") - if signature is None and input_example is not None: - wrapped_model = _H2OModelWrapper(model_file, model_type, path, extra_prediction_args) - signature = _infer_signature(h2o_model, wrapped_model, input_example) - elif signature is False: - signature = None - if mlflow_model is None: mlflow_model = Model() if signature is not None: @@ -236,6 +217,7 @@ def save_model( model_file=model_file, model_type=model_type_upper, extra_prediction_args=extra_prediction_args, + relevant_columns=_get_relevant_columns(h2o_model), h2o_version=h2o.__version__, code=code_dir_subpath, ) @@ -337,15 +319,17 @@ def _load_model(path): model_type = flavor_conf["model_type"] model_file = flavor_conf["model_file"] extra_prediction_args = flavor_conf["extra_prediction_args"] - return _H2OModelWrapper(model_file, model_type, path, extra_prediction_args) + relevant_columns = flavor_conf["relevant_columns"] + return _H2OModelWrapper(model_file, model_type, path, extra_prediction_args, relevant_columns) class _H2OModelWrapper: - def __init__(self, model_file, model_type, path, extra_prediction_args): + def __init__(self, model_file, model_type, path, extra_prediction_args, relevant_columns): self.model_file = model_file self.model_type = model_type self.path = path self.extra_prediction_args = extra_prediction_args if extra_prediction_args is not None else [] + self.relevant_columns = relevant_columns self.genmodel_jar_path = os.path.join(path, "h2o-genmodel.jar") def predict(self, dataframe, params=None): @@ -360,7 +344,8 @@ def predict(self, dataframe, params=None): output_file = os.path.join(tempdir, "output.csv") separator = "`" import csv - dataframe.to_csv(input_file, index=False, quoting=csv.QUOTE_NONNUMERIC, sep=separator) + sub_dataframe = dataframe[self.relevant_columns] + sub_dataframe.to_csv(input_file, index=False, 
quoting=csv.QUOTE_NONNUMERIC, sep=separator) if self.model_type == "MOJO": class_path = self.genmodel_jar_path type_parameter = "--mojo" From cbfdd3a7f905feed454930f3aa40fd49278a28d7 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:05:19 +0100 Subject: [PATCH 19/43] Add doc --- h2o-py-mlflow-flavor/README.md | 91 ++++++++++++++++++- .../h2o_mlflow_flavor/__init__.py | 8 +- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/h2o-py-mlflow-flavor/README.md b/h2o-py-mlflow-flavor/README.md index 2b2ae69f20ab..cc41d6b7c939 100644 --- a/h2o-py-mlflow-flavor/README.md +++ b/h2o-py-mlflow-flavor/README.md @@ -1,3 +1,92 @@ # H2O-3 MLFlow Flavor -A tiny library containing [MLFlow](https://mlflow.org/) flavors for working with H2O-3 MOJO and POJO models. +A tiny library containing a [MLFlow](https://mlflow.org/) flavor for working with H2O-3 MOJO and POJO models. + +## Logging Models to MLFlow Registry + +The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: +```python +import mlflow +import h2o_mlflow_flavor +mlflow.set_tracking_uri("http://127.0.0.1:8080") + +h2o_model = ... training phase ... + +with mlflow.start_run(run_name="myrun") as run: + h2o_mlflow_flavor.log_model(h2o_model=h2o_model, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) +``` + +Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: + +* **model_type** - It indicates whether the model should be exported as + [MOJO](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/mojo-quickstart.html#what-is-a-mojo) + or [POJO](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/pojo-quickstart.html#what-is-a-pojo). + The default value is `MOJO`. +* **extra_prediction_args** A list of extra arguments for java scoring process. Possible values: + * `--setConvertInvalidNum` - The scoring process will convert invalid numbers to NA. 
+ * `--predictContributions` - The scoring process will Return also Shapley values a long with the predictions. + Model must support that Shapley values, otherwise scoring process will throw an error. + * `--predictCalibrated` - The scoring process will also return calibrated prediction values. + +The `save_model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log_model` function. + +## Extracting Information about Model +The flavor offers several functions to extract information about the model. + +* `get_metrics(h2o_model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and +takes following parameters: + * `h2o_model` - An H2O binary model. + * `metric_type` - The type of metrics. Possible values are "training", "validation", "cross_validation". + If parameter is not specified, metrics for all types are returned. +* `get_params(h2o_model)` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one +parameter: + * `h2o_model` - An H2O binary model. +* `get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)` - Creates an example Pandas dataset +from the training dataset of H2O binary model. It takes following parameters: + * `h2o_model` - An H2O binary model. + * `number_of_records` - A number of records that will be extracted from the training dataset. + * `relevant_columns_only` - A flag indicating whether the output dataset should contain only columns required by + the model. Defaults to `True`. + +The functions can be utilized as follows: +```python +import mlflow +import h2o_mlflow_flavor +mlflow.set_tracking_uri("http://127.0.0.1:8080") + +h2o_model = ... training phase ... 
+ +with mlflow.start_run(run_name="myrun") as run: + mlflow.log_params(h2o_mlflow_flavor.get_params(h2o_model)) + mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) + input_example = h2o_mlflow_flavor.get_input_example(h2o_model) + h2o_mlflow_flavor.log_model(h2o_model=h2o_model, + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) +``` + +## Model Scoring +After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing +that model requires is a `h2o-gemodel.jar` which was persisted with the model during saving procedure. + +The model could be loaded by the function `load_model(model_uri, dst_path=None)`. It returns an objecting making +predictions on Pandas dataframe and takes the following parameters: +* `model_uri` - An unique identifier of the model within MLFlow registry. +* `dst_path` - (Optional) A local filesystem path for downloading the persisted form of the model. + +The object for scoring could be obtained also via the `pyfunc` flavor as follows: +```python +import mlflow +mlflow.set_tracking_uri("http://127.0.0.1:8080") + +logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' +loaded_model = mlflow.pyfunc.load_model(logged_model) +import pandas as pd +data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") +loaded_model.predict(data) +``` diff --git a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py index 21323b8041d9..cdbfc0fbc511 100644 --- a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py +++ b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py @@ -281,7 +281,7 @@ def log_model( :param pip_requirements: {{ pip_requirements }} :param extra_pip_requirements: {{ extra_pip_requirements }} :param model_type: A flag deciding whether the model is MOJO or POJO. 
- :param extra_prediction_args: A list of extra arguments for java predictions process. Possible values: + :param extra_prediction_args: A list of extra arguments for java scoring process. Possible values: --setConvertInvalidNum - Converts invalid numbers to NA --predictContributions - Returns also Shapley values a long with the predictions --predictCalibrated - Return also calibrated prediction values. @@ -308,6 +308,12 @@ def log_model( def load_model(model_uri, dst_path=None): + """ + Obtains a model from MLFlow registry. + :param model_uri: An unique identifier of the model within MLFlow registry. + :param dst_path: (Optional) A temporary folder for downloading the persisted form of the model. + :return: A model making predictions on Pandas dataframe. + """ path = _download_artifact_from_uri( artifact_uri=model_uri, output_path=dst_path ) From 38bdb5f1c057da2681e1b73bbc9997cb1a73b3db Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:16:23 +0100 Subject: [PATCH 20/43] Add description.rst --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 152 ++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 1 deletion(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 5969272ac94f..95b048870f99 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -1,4 +1,154 @@ H2O-3 MLFlow Flavor =================== -A tiny library containing MLFlow flavor for working with H2O-3 MOJO and POJO models. +A tiny library containing a `MLFlow `_ flavor for working with H2O-3 MOJO and POJO models. + +# Logging Models to MLFlow Registry +=================================== + +The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: + +```python + +import mlflow + +import h2o*mlflow*flavor + +mlflow.set*tracking*uri("http://127.0.0.1:8080") + +h2o_model = ... training phase ... 
+ +with mlflow.start*run(run*name="myrun") as run: + + h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, + + artifact\_path="folder", + + model\_type="MOJO", + + extra\_prediction\_args=["\-\-predictCalibrated"]) + +``` + +Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: + +* **model_type** - It indicates whether the model should be exported as + + [MOJO](https://docs.h2o.ai/h2o/latest\-stable/h2o\-docs/mojo\-quickstart.html#what\-is\-a\-mojo) + + or [POJO](https://docs.h2o.ai/h2o/latest\-stable/h2o\-docs/pojo\-quickstart.html#what\-is\-a\-pojo). + + The default value is `MOJO`. + +* **extra*prediction*args** A list of extra arguments for java scoring process. Possible values: + + \* `\-\-setConvertInvalidNum` \- The scoring process will convert invalid numbers to NA. + + \* `\-\-predictContributions` \- The scoring process will Return also Shapley values a long with the predictions. + + Model must support that Shapley values, otherwise scoring process will throw an error. + + \* `\-\-predictCalibrated` \- The scoring process will also return calibrated prediction values. + +The `save*model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log*model` function. + +# Extracting Information about Model +==================================== + +The flavor offers several functions to extract information about the model. + +* `get*metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and + +takes following parameters: + + \* `h2o\_model` \- An H2O binary model. + + \* `metric\_type` \- The type of metrics. Possible values are "training", "validation", "cross\_validation". + + If parameter is not specified, metrics for all types are returned. + +* `get*params(h2o*model)` - Extracts training parameters for the H2O binary model. 
It returns dictionary and expects one + +parameter: + + \* `h2o\_model` \- An H2O binary model. + +* `get*input*example(h2o*model, number*of*records=5, relevant*columns_only=True)` - Creates an example Pandas dataset + +from the training dataset of H2O binary model. It takes following parameters: + + \* `h2o\_model` \- An H2O binary model. + + \* `number\_of\_records` \- A number of records that will be extracted from the training dataset. + + \* `relevant\_columns\_only` \- A flag indicating whether the output dataset should contain only columns required by + + the model. Defaults to `True`. + +The functions can be utilized as follows: + +```python + +import mlflow + +import h2o*mlflow*flavor + +mlflow.set*tracking*uri("http://127.0.0.1:8080") + +h2o_model = ... training phase ... + +with mlflow.start*run(run*name="myrun") as run: + + mlflow.log\_params(h2o\_mlflow\_flavor.get\_params(h2o\_model)) + + mlflow.log\_metrics(h2o\_mlflow\_flavor.get\_metrics(h2o\_model)) + + input\_example = h2o\_mlflow\_flavor.get\_input\_example(h2o\_model) + + h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, + + input\_example=input\_example, + + artifact\_path="folder", + + model\_type="MOJO", + + extra\_prediction\_args=["\-\-predictCalibrated"]) + +``` + +# Model Scoring +=============== + +After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing + +that model requires is a `h2o-gemodel.jar` which was persisted with the model during saving procedure. + +The model could be loaded by the function `load*model(model*uri, dst_path=None)`. It returns an objecting making + +predictions on Pandas dataframe and takes the following parameters: + +* `model_uri` - An unique identifier of the model within MLFlow registry. + +* `dst_path` - (Optional) A local filesystem path for downloading the persisted form of the model. 
+ +The object for scoring could be obtained also via the `pyfunc` flavor as follows: + +```python + +import mlflow + +mlflow.set*tracking*uri("http://127.0.0.1:8080") + +logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' + +loaded*model = mlflow.pyfunc.load*model(logged_model) + +import pandas as pd + +data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") + +loaded_model.predict(data) + +``` + From 68f5989fe4a8f362e4d2946bb0d8e31997dc495b Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:18:53 +0100 Subject: [PATCH 21/43] Update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 95b048870f99..0ece791faad5 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -3,8 +3,8 @@ H2O-3 MLFlow Flavor A tiny library containing a `MLFlow `_ flavor for working with H2O-3 MOJO and POJO models. -# Logging Models to MLFlow Registry -=================================== +Logging Models to MLFlow Registry +================================= The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: @@ -52,8 +52,8 @@ Compared to `log_model` functions of the other flavors being a part of MLFlow, t The `save*model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log*model` function. -# Extracting Information about Model -==================================== +Extracting Information about Model +================================== The flavor offers several functions to extract information about the model. 
@@ -117,8 +117,8 @@ with mlflow.start*run(run*name="myrun") as run: ``` -# Model Scoring -=============== +Model Scoring +============= After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing From e7463f1235cd59d702548de45c90ed71467306fa Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:28:35 +0100 Subject: [PATCH 22/43] description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 78 ++++++++++------------------ 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 0ece791faad5..e7541b42d53d 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -4,31 +4,23 @@ H2O-3 MLFlow Flavor A tiny library containing a `MLFlow `_ flavor for working with H2O-3 MOJO and POJO models. Logging Models to MLFlow Registry -================================= +--------------------------------- The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: -```python + .. code-block:: python import mlflow - import h2o*mlflow*flavor mlflow.set*tracking*uri("http://127.0.0.1:8080") - h2o_model = ... training phase ... - with mlflow.start*run(run*name="myrun") as run: - - h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, - + h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, artifact\_path="folder", - model\_type="MOJO", - extra\_prediction\_args=["\-\-predictCalibrated"]) -``` Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: @@ -42,113 +34,95 @@ Compared to `log_model` functions of the other flavors being a part of MLFlow, t * **extra*prediction*args** A list of extra arguments for java scoring process. Possible values: - \* `\-\-setConvertInvalidNum` \- The scoring process will convert invalid numbers to NA. 
+ \* ``\-\-setConvertInvalidNum`` \- The scoring process will convert invalid numbers to NA. - \* `\-\-predictContributions` \- The scoring process will Return also Shapley values a long with the predictions. + \* ``\-\-predictContributions`` \- The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. - \* `\-\-predictCalibrated` \- The scoring process will also return calibrated prediction values. + \* ``\-\-predictCalibrated`` \- The scoring process will also return calibrated prediction values. The `save*model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log*model` function. Extracting Information about Model -================================== +---------------------------------- The flavor offers several functions to extract information about the model. -* `get*metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and +* `get_metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: - \* `h2o\_model` \- An H2O binary model. + * ``h2o\_model`` \- An H2O binary model. - \* `metric\_type` \- The type of metrics. Possible values are "training", "validation", "cross\_validation". + * ``metric\_type`` \- The type of metrics. Possible values are "training", "validation", "cross\_validation". If parameter is not specified, metrics for all types are returned. -* `get*params(h2o*model)` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one +* ``get_params(h2o*model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: - \* `h2o\_model` \- An H2O binary model. + * ``h2o\_model`` \- An H2O binary model. 
-* `get*input*example(h2o*model, number*of*records=5, relevant*columns_only=True)` - Creates an example Pandas dataset +* ``get_input_example(h2o*model, number*of*records=5, relevant*columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: - \* `h2o\_model` \- An H2O binary model. + * ``h2o\_model`` \- An H2O binary model. - \* `number\_of\_records` \- A number of records that will be extracted from the training dataset. + * ``number\_of\_records`` \- A number of records that will be extracted from the training dataset. - \* `relevant\_columns\_only` \- A flag indicating whether the output dataset should contain only columns required by + * ``relevant\_columns\_only`` \- A flag indicating whether the output dataset should contain only columns required by - the model. Defaults to `True`. + the model. Defaults to ``True``. The functions can be utilized as follows: -```python + .. code-block:: python import mlflow - import h2o*mlflow*flavor - mlflow.set*tracking*uri("http://127.0.0.1:8080") h2o_model = ... training phase ... with mlflow.start*run(run*name="myrun") as run: - mlflow.log\_params(h2o\_mlflow\_flavor.get\_params(h2o\_model)) - mlflow.log\_metrics(h2o\_mlflow\_flavor.get\_metrics(h2o\_model)) - input\_example = h2o\_mlflow\_flavor.get\_input\_example(h2o\_model) - h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, - input\_example=input\_example, - artifact\_path="folder", - model\_type="MOJO", - extra\_prediction\_args=["\-\-predictCalibrated"]) -``` Model Scoring -============= +------------- After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing -that model requires is a `h2o-gemodel.jar` which was persisted with the model during saving procedure. +that model requires is a ``h2o-gemodel.jar`` which was persisted with the model during saving procedure. 
-The model could be loaded by the function `load*model(model*uri, dst_path=None)`. It returns an objecting making +The model could be loaded by the function ``load*model(model*uri, dst_path=None)``. It returns an objecting making predictions on Pandas dataframe and takes the following parameters: -* `model_uri` - An unique identifier of the model within MLFlow registry. +* ``model_uri`` - An unique identifier of the model within MLFlow registry. -* `dst_path` - (Optional) A local filesystem path for downloading the persisted form of the model. +* ``dst_path`` - (Optional) A local filesystem path for downloading the persisted form of the model. The object for scoring could be obtained also via the `pyfunc` flavor as follows: -```python + .. code-block:: python import mlflow - -mlflow.set*tracking*uri("http://127.0.0.1:8080") +mlflow.set_tracking_uri("http://127.0.0.1:8080") logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' - -loaded*model = mlflow.pyfunc.load*model(logged_model) +loaded_model = mlflow.pyfunc.load_model(logged_model) import pandas as pd - data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") - loaded_model.predict(data) - -``` - From c7424823665646e680b2039f92549d88dcb1cf25 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:33:32 +0100 Subject: [PATCH 23/43] description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 64 ++++++++++++++-------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index e7541b42d53d..24ddad582753 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -8,14 +8,14 @@ Logging Models to MLFlow Registry The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: - .. code-block:: python + .. 
code-block:: Python -import mlflow -import h2o*mlflow*flavor + import mlflow + import h2o*mlflow*flavor -mlflow.set*tracking*uri("http://127.0.0.1:8080") -h2o_model = ... training phase ... -with mlflow.start*run(run*name="myrun") as run: + mlflow.set*tracking*uri("http://127.0.0.1:8080") + h2o_model = ... training phase ... + with mlflow.start*run(run*name="myrun") as run: h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, artifact\_path="folder", model\_type="MOJO", @@ -34,13 +34,11 @@ Compared to `log_model` functions of the other flavors being a part of MLFlow, t * **extra*prediction*args** A list of extra arguments for java scoring process. Possible values: - \* ``\-\-setConvertInvalidNum`` \- The scoring process will convert invalid numbers to NA. + * ``\-\-setConvertInvalidNum`` \- The scoring process will convert invalid numbers to NA. - \* ``\-\-predictContributions`` \- The scoring process will Return also Shapley values a long with the predictions. + * ``\-\-predictContributions`` \- The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. - Model must support that Shapley values, otherwise scoring process will throw an error. - - \* ``\-\-predictCalibrated`` \- The scoring process will also return calibrated prediction values. + * ``\-\-predictCalibrated`` \- The scoring process will also return calibrated prediction values. The `save*model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log*model` function. @@ -79,23 +77,23 @@ from the training dataset of H2O binary model. It takes following parameters: The functions can be utilized as follows: - .. code-block:: python + .. code-block:: Python -import mlflow -import h2o*mlflow*flavor -mlflow.set*tracking*uri("http://127.0.0.1:8080") + import mlflow + import h2o_mlflow_flavor + mlflow.set*tracking*uri("http://127.0.0.1:8080") -h2o_model = ... 
training phase ... + h2o_model = ... training phase ... -with mlflow.start*run(run*name="myrun") as run: - mlflow.log\_params(h2o\_mlflow\_flavor.get\_params(h2o\_model)) - mlflow.log\_metrics(h2o\_mlflow\_flavor.get\_metrics(h2o\_model)) - input\_example = h2o\_mlflow\_flavor.get\_input\_example(h2o\_model) - h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, - input\_example=input\_example, - artifact\_path="folder", - model\_type="MOJO", - extra\_prediction\_args=["\-\-predictCalibrated"]) + with mlflow.start*run(run*name="myrun") as run: + mlflow.log\_params(h2o\_mlflow\_flavor.get\_params(h2o\_model)) + mlflow.log\_metrics(h2o\_mlflow\_flavor.get\_metrics(h2o\_model)) + input\_example = h2o\_mlflow\_flavor.get\_input\_example(h2o\_model) + h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, + input\_example=input\_example, + artifact\_path="folder", + model\_type="MOJO", + extra\_prediction\_args=["\-\-predictCalibrated"]) Model Scoring @@ -115,14 +113,14 @@ predictions on Pandas dataframe and takes the following parameters: The object for scoring could be obtained also via the `pyfunc` flavor as follows: - .. code-block:: python + .. 
code-block:: Python -import mlflow -mlflow.set_tracking_uri("http://127.0.0.1:8080") + import mlflow + mlflow.set_tracking_uri("http://127.0.0.1:8080") -logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' -loaded_model = mlflow.pyfunc.load_model(logged_model) + logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' + loaded_model = mlflow.pyfunc.load_model(logged_model) -import pandas as pd -data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") -loaded_model.predict(data) + import pandas as pd + data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") + loaded_model.predict(data) From f288e551c508f0084bf353fffceb3b74239c4c81 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:42:52 +0100 Subject: [PATCH 24/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 85 +++++++++++++--------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 24ddad582753..5693833171ca 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -8,39 +8,35 @@ Logging Models to MLFlow Registry The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: - .. code-block:: Python +.. code-block:: Python import mlflow - import h2o*mlflow*flavor + import h2o_mlflow_flavor - mlflow.set*tracking*uri("http://127.0.0.1:8080") + mlflow.set_tracking_uri("http://127.0.0.1:8080") + h2o_model = ... training phase ... 
- with mlflow.start*run(run*name="myrun") as run: - h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, - artifact\_path="folder", - model\_type="MOJO", - extra\_prediction\_args=["\-\-predictCalibrated"]) + + with mlflow.start_run(run_name="myrun") as run: + h2o_mlflow_flavor.log_model(h2o_model=h2o_model, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -* **model_type** - It indicates whether the model should be exported as - - [MOJO](https://docs.h2o.ai/h2o/latest\-stable/h2o\-docs/mojo\-quickstart.html#what\-is\-a\-mojo) - - or [POJO](https://docs.h2o.ai/h2o/latest\-stable/h2o\-docs/pojo\-quickstart.html#what\-is\-a\-pojo). - +* **model_type** - It indicates whether the model should be exported as + `MOJO `_ + or `POJO `_. The default value is `MOJO`. -* **extra*prediction*args** A list of extra arguments for java scoring process. Possible values: - - * ``\-\-setConvertInvalidNum`` \- The scoring process will convert invalid numbers to NA. - - * ``\-\-predictContributions`` \- The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. - - * ``\-\-predictCalibrated`` \- The scoring process will also return calibrated prediction values. +* **extra_prediction_args** A list of extra arguments for java scoring process. Possible values: + * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. + * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. + * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. 
-The `save*model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log*model` function. +The `save_model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log_model` function. Extracting Information about Model ---------------------------------- @@ -51,49 +47,46 @@ The flavor offers several functions to extract information about the model. takes following parameters: - * ``h2o\_model`` \- An H2O binary model. - - * ``metric\_type`` \- The type of metrics. Possible values are "training", "validation", "cross\_validation". - + * ``h2o_model`` - An H2O binary model. + * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross\_validation". If parameter is not specified, metrics for all types are returned. -* ``get_params(h2o*model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one +* ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: -parameter: + * ``h2o_model`` - An H2O binary model. - * ``h2o\_model`` \- An H2O binary model. - -* ``get_input_example(h2o*model, number*of*records=5, relevant*columns_only=True)`` - Creates an example Pandas dataset +* ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: - * ``h2o\_model`` \- An H2O binary model. + * ``h2o_model`` - An H2O binary model. - * ``number\_of\_records`` \- A number of records that will be extracted from the training dataset. + * ``number_of_records`` - A number of records that will be extracted from the training dataset. 
- * ``relevant\_columns\_only`` \- A flag indicating whether the output dataset should contain only columns required by + * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. The functions can be utilized as follows: - .. code-block:: Python +.. code-block:: Python import mlflow import h2o_mlflow_flavor - mlflow.set*tracking*uri("http://127.0.0.1:8080") + + mlflow.set_tracking_uri("http://127.0.0.1:8080") h2o_model = ... training phase ... - with mlflow.start*run(run*name="myrun") as run: - mlflow.log\_params(h2o\_mlflow\_flavor.get\_params(h2o\_model)) - mlflow.log\_metrics(h2o\_mlflow\_flavor.get\_metrics(h2o\_model)) - input\_example = h2o\_mlflow\_flavor.get\_input\_example(h2o\_model) - h2o\_mlflow\_flavor.log\_model(h2o\_model=h2o\_model, - input\_example=input\_example, - artifact\_path="folder", - model\_type="MOJO", - extra\_prediction\_args=["\-\-predictCalibrated"]) + with mlflow.start_run(run_name="myrun") as run: + mlflow.log_params(h2o_mlflow_flavor.get_params(h2o_model)) + mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) + input_example = h2o_mlflow_flavor.get_input_example(h2o_model) + h2o_mlflow_flavor.log_model(h2o_model=h2o_model, + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Model Scoring @@ -103,7 +96,7 @@ After a model obtained from the model registry, the model doesn't require h2o ru that model requires is a ``h2o-gemodel.jar`` which was persisted with the model during saving procedure. -The model could be loaded by the function ``load*model(model*uri, dst_path=None)``. It returns an objecting making +The model could be loaded by the function ``load_model(model_uri, dst_path=None)``. 
It returns an objecting making predictions on Pandas dataframe and takes the following parameters: From 35e81945f46658c57258e8ea288c4d1ee0e19236 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:48:32 +0100 Subject: [PATCH 25/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 5693833171ca..dc31427c237e 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -26,12 +26,12 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -* **model_type** - It indicates whether the model should be exported as +* ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. -* **extra_prediction_args** A list of extra arguments for java scoring process. Possible values: +* ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. @@ -43,26 +43,17 @@ Extracting Information about Model The flavor offers several functions to extract information about the model. -* `get_metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and - -takes following parameters: - +* `get_metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. 
It returns dictionary and takes following parameters: * ``h2o_model`` - An H2O binary model. * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross\_validation". If parameter is not specified, metrics for all types are returned. * ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: - * ``h2o_model`` - An H2O binary model. -* ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset - -from the training dataset of H2O binary model. It takes following parameters: - +* ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: * ``h2o_model`` - An H2O binary model. - * ``number_of_records`` - A number of records that will be extracted from the training dataset. - * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. 
From 06cca19b9929e37032a38e15e69dd9304bc02741 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:51:33 +0100 Subject: [PATCH 26/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index dc31427c237e..c8df6c3ca706 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -26,12 +26,12 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -* ``model_type`` - It indicates whether the model should be exported as +* ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. -* ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: +* ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. @@ -43,9 +43,9 @@ Extracting Information about Model The flavor offers several functions to extract information about the model. -* `get_metrics(h2o*model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: +* ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: * ``h2o_model`` - An H2O binary model. 
- * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross\_validation". + * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". If parameter is not specified, metrics for all types are returned. * ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: From e2c66801bba054c2f245ddb5e391b999fd01fc20 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:53:13 +0100 Subject: [PATCH 27/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index c8df6c3ca706..f25c4f168a15 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -26,12 +26,10 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -* ``model_type`` - It indicates whether the model should be exported as - `MOJO `_ - or `POJO `_. - The default value is `MOJO`. +- ``model_type`` - It indicates whether the model should be exported as `MOJO `_ +or `POJO `_. The default value is `MOJO`. -* ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: +- ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. 
@@ -43,15 +41,15 @@ Extracting Information about Model The flavor offers several functions to extract information about the model. -* ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: +- ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: * ``h2o_model`` - An H2O binary model. * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". If parameter is not specified, metrics for all types are returned. -* ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: +- ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: * ``h2o_model`` - An H2O binary model. -* ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: +- ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: * ``h2o_model`` - An H2O binary model. * ``number_of_records`` - A number of records that will be extracted from the training dataset. 
* ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by From 31c8fb1671875723f0d07b9dc0f2a3b9b106c706 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:54:17 +0100 Subject: [PATCH 28/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index f25c4f168a15..4a70301a6cd7 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -28,7 +28,6 @@ Compared to `log_model` functions of the other flavors being a part of MLFlow, t - ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. - - ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. @@ -45,16 +44,12 @@ The flavor offers several functions to extract information about the model. * ``h2o_model`` - An H2O binary model. * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". If parameter is not specified, metrics for all types are returned. - - ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: * ``h2o_model`` - An H2O binary model. - - ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: * ``h2o_model`` - An H2O binary model. 
* ``number_of_records`` - A number of records that will be extracted from the training dataset. - * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by - - the model. Defaults to ``True``. + * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. The functions can be utilized as follows: From 1b3ed2d8324a1b414de8ce372f7c02ca7bb09cc1 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:55:25 +0100 Subject: [PATCH 29/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 4a70301a6cd7..7375b03568e5 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -41,15 +41,15 @@ Extracting Information about Model The flavor offers several functions to extract information about the model. - ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: - * ``h2o_model`` - An H2O binary model. - * ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". + - ``h2o_model`` - An H2O binary model. + - ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". If parameter is not specified, metrics for all types are returned. - ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: - * ``h2o_model`` - An H2O binary model. + - ``h2o_model`` - An H2O binary model. - ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. 
It takes following parameters: - * ``h2o_model`` - An H2O binary model. - * ``number_of_records`` - A number of records that will be extracted from the training dataset. - * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. + - ``h2o_model`` - An H2O binary model. + - ``number_of_records`` - A number of records that will be extracted from the training dataset. + -``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. The functions can be utilized as follows: From 7c33c80b37049ea8141f70b2f0f99f5376a1927e Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:56:14 +0100 Subject: [PATCH 30/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 7375b03568e5..3b8b3cbc457f 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -29,9 +29,9 @@ Compared to `log_model` functions of the other flavors being a part of MLFlow, t - ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. - ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: - * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. - * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. - * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. + - ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. 
+ - ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. + - ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. The `save_model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log_model` function. @@ -84,9 +84,8 @@ The model could be loaded by the function ``load_model(model_uri, dst_path=None) predictions on Pandas dataframe and takes the following parameters: -* ``model_uri`` - An unique identifier of the model within MLFlow registry. - -* ``dst_path`` - (Optional) A local filesystem path for downloading the persisted form of the model. +- ``model_uri`` - An unique identifier of the model within MLFlow registry. +- ``dst_path`` - (Optional) A local filesystem path for downloading the persisted form of the model. The object for scoring could be obtained also via the `pyfunc` flavor as follows: From bd4a5477a1e0718da193c2129866a4b166a58b2d Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:56:53 +0100 Subject: [PATCH 31/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 3b8b3cbc457f..fdf5fe160e7a 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -77,11 +77,8 @@ Model Scoring ------------- After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing - -that model requires is a ``h2o-gemodel.jar`` which was persisted with the model during saving procedure. - +that model requires is a ``h2o-gemodel.jar`` which was persisted with the model during saving procedure. The model could be loaded by the function ``load_model(model_uri, dst_path=None)``. 
It returns an objecting making - predictions on Pandas dataframe and takes the following parameters: - ``model_uri`` - An unique identifier of the model within MLFlow registry. @@ -89,7 +86,7 @@ predictions on Pandas dataframe and takes the following parameters: The object for scoring could be obtained also via the `pyfunc` flavor as follows: - .. code-block:: Python +.. code-block:: Python import mlflow mlflow.set_tracking_uri("http://127.0.0.1:8080") From c816cc7e2d354678e432c8fe3c173ffc3658cb95 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 15:59:43 +0100 Subject: [PATCH 32/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 46 ++++++++++++++++++---------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index fdf5fe160e7a..32b394d45b10 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -26,12 +26,16 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -- ``model_type`` - It indicates whether the model should be exported as `MOJO `_ +* ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. -- ``extra_prediction_args`` - A list of extra arguments for java scoring process. Possible values: - - ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. - - ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. - - ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. + +* ``extra_prediction_args`` - A list of extra arguments for java scoring process. 
Possible values: + + * ``--setConvertInvalidNum`` - The scoring process will convert invalid numbers to NA. + + * ``--predictContributions`` - The scoring process will Return also Shapley values a long with the predictions. Model must support that Shapley values, otherwise scoring process will throw an error. + + * ``--predictCalibrated`` - The scoring process will also return calibrated prediction values. The `save_model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log_model` function. @@ -40,16 +44,23 @@ Extracting Information about Model The flavor offers several functions to extract information about the model. -- ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: - - ``h2o_model`` - An H2O binary model. - - ``metric_type`` - The type of metrics. Possible values are "training", "validation", "cross_validation". - If parameter is not specified, metrics for all types are returned. -- ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: - - ``h2o_model`` - An H2O binary model. -- ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: - - ``h2o_model`` - An H2O binary model. - - ``number_of_records`` - A number of records that will be extracted from the training dataset. - -``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. +* ``get_metrics(h2o_model, metric_type=None)`` - Extracts metrics from the trained H2O binary model. It returns dictionary and takes following parameters: + + * ``h2o_model`` - An H2O binary model. + + * ``metric_type`` - The type of metrics. 
Possible values are "training", "validation", "cross_validation". If parameter is not specified, metrics for all types are returned. + +* ``get_params(h2o_model)`` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one parameter: + + * ``h2o_model`` - An H2O binary model. + +* ``get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)`` - Creates an example Pandas dataset from the training dataset of H2O binary model. It takes following parameters: + + * ``h2o_model`` - An H2O binary model. + + * ``number_of_records`` - A number of records that will be extracted from the training dataset. + + * ``relevant_columns_only`` - A flag indicating whether the output dataset should contain only columns required by the model. Defaults to ``True``. The functions can be utilized as follows: @@ -81,8 +92,9 @@ that model requires is a ``h2o-gemodel.jar`` which was persisted with the model The model could be loaded by the function ``load_model(model_uri, dst_path=None)``. It returns an objecting making predictions on Pandas dataframe and takes the following parameters: -- ``model_uri`` - An unique identifier of the model within MLFlow registry. -- ``dst_path`` - (Optional) A local filesystem path for downloading the persisted form of the model. +* ``model_uri`` - An unique identifier of the model within MLFlow registry. + +* ``dst_path`` - (Optional) A local filesystem path for downloading the persisted form of the model. 
The object for scoring could be obtained also via the `pyfunc` flavor as follows: From 919126509f8a9c483fb222f4e1d165497de7c61a Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:01:12 +0100 Subject: [PATCH 33/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 32b394d45b10..31fa83ff7245 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -31,11 +31,11 @@ or `POJO Date: Mon, 13 Nov 2023 16:02:12 +0100 Subject: [PATCH 34/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 31fa83ff7245..f86a6d4690b8 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -19,9 +19,9 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry with mlflow.start_run(run_name="myrun") as run: h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: @@ -78,10 +78,10 @@ The functions can be utilized as follows: mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) input_example = h2o_mlflow_flavor.get_input_example(h2o_model) h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - input_example=input_example, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + 
extra_prediction_args=["--predictCalibrated"]) Model Scoring From 76de6bb53049e0421f00571c91ab0192828d42bc Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:02:54 +0100 Subject: [PATCH 35/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index f86a6d4690b8..a0d66b4bed8d 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -19,9 +19,9 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry with mlflow.start_run(run_name="myrun") as run: h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: @@ -78,10 +78,10 @@ The functions can be utilized as follows: mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) input_example = h2o_mlflow_flavor.get_input_example(h2o_model) h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - input_example=input_example, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Model Scoring From ad5be38889b69b55875a7a0aed7c0e1676f23ea2 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:03:39 +0100 Subject: [PATCH 36/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index a0d66b4bed8d..cf123bf23506 100644 --- 
a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -19,9 +19,9 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry with mlflow.start_run(run_name="myrun") as run: h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: @@ -78,10 +78,10 @@ The functions can be utilized as follows: mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) input_example = h2o_mlflow_flavor.get_input_example(h2o_model) h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - input_example=input_example, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Model Scoring From df982920f5edcbd12b5549d63b1816ed44b72d32 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:04:15 +0100 Subject: [PATCH 37/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index cf123bf23506..6d7ade192670 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -19,9 +19,9 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry with mlflow.start_run(run_name="myrun") as run: h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Compared to `log_model` 
functions of the other flavors being a part of MLFlow, this function has two extra arguments: @@ -78,10 +78,10 @@ The functions can be utilized as follows: mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) input_example = h2o_mlflow_flavor.get_input_example(h2o_model) h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - input_example=input_example, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) + input_example=input_example, + artifact_path="folder", + model_type="MOJO", + extra_prediction_args=["--predictCalibrated"]) Model Scoring From aad61ae62fb503886837b88421ba3b5d88a679f3 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:05:05 +0100 Subject: [PATCH 38/43] update description --- h2o-py-mlflow-flavor/DESCRIPTION.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/DESCRIPTION.rst index 6d7ade192670..c4defa766905 100644 --- a/h2o-py-mlflow-flavor/DESCRIPTION.rst +++ b/h2o-py-mlflow-flavor/DESCRIPTION.rst @@ -26,8 +26,7 @@ The model that was trained with H2O-3 runtime can be exported to MLFlow registry Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: -* ``model_type`` - It indicates whether the model should be exported as `MOJO `_ -or `POJO `_. The default value is `MOJO`. +* ``model_type`` - It indicates whether the model should be exported as `MOJO `_ or `POJO `_. The default value is `MOJO`. * ``extra_prediction_args`` - A list of extra arguments for java scoring process. 
Possible values: From 77c43911a2c118a865b0bd28313b18d1f3d3a61c Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 16:14:02 +0100 Subject: [PATCH 39/43] Add flavor as self reference --- h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py index cdbfc0fbc511..4d6aa93bf459 100644 --- a/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py +++ b/h2o-py-mlflow-flavor/h2o_mlflow_flavor/__init__.py @@ -35,6 +35,7 @@ _validate_and_prepare_target_save_path, ) from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils.requirements_utils import _get_pinned_requirement _logger = logging.getLogger(__name__) @@ -47,7 +48,7 @@ def get_default_pip_requirements(): Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment that, at minimum, contains these requirements. """ - return [] + return [_get_pinned_requirement("h2o_mlflow_flavor")] def get_default_conda_env(): From b9be04a51d6094511d89414be81c70bd47e3aacc Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Mon, 13 Nov 2023 19:18:53 +0100 Subject: [PATCH 40/43] Update build definition --- h2o-py-mlflow-flavor/build.gradle | 17 +---------------- h2o-py-mlflow-flavor/setup.py | 10 +--------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/h2o-py-mlflow-flavor/build.gradle b/h2o-py-mlflow-flavor/build.gradle index 58dcd1d243d2..2ba20ba9eb6f 100644 --- a/h2o-py-mlflow-flavor/build.gradle +++ b/h2o-py-mlflow-flavor/build.gradle @@ -15,21 +15,6 @@ ext { testsPath = file("tests") } -// -// Create a file with version for Python dist task -// -task createVersionFiles() { - doLast { - file("${buildDir}/h2o_mlflow_flavor/").mkdirs() - File version_file = new File("${buildDir}/h2o_mlflow_flavor/", "version.txt") - version_file.write(PROJECT_VERSION) - - File build_file = new 
File("${buildDir}/h2o_mlflow_flavor/", "buildinfo.txt") - build_file.write(buildVersion.toString()) - } -} - - task copySrcFiles(type: Copy) { from ("${projectDir}") { include "setup.py" @@ -41,7 +26,7 @@ task copySrcFiles(type: Copy) { into "${buildDir}" } -task buildDist(type: Exec, dependsOn: [createVersionFiles, copySrcFiles]) { +task buildDist(type: Exec, dependsOn: [copySrcFiles]) { workingDir buildDir doFirst { file("${buildDir}/tmp").mkdirs() diff --git a/h2o-py-mlflow-flavor/setup.py b/h2o-py-mlflow-flavor/setup.py index aeb84e7c8d50..5730e35ed4e7 100644 --- a/h2o-py-mlflow-flavor/setup.py +++ b/h2o-py-mlflow-flavor/setup.py @@ -9,11 +9,7 @@ with open(os.path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f: long_description = f.read() -version = "0.1.0-SNAPSHOT" -# Get the version from the relevant file -with open(os.path.join(here, 'h2o_mlflow_flavor/version.txt'), encoding='utf-8') as f: - version = f.read() - +version = "0.1.0" packages = find_packages(exclude=["tests*"]) print("Found packages: %r" % packages) @@ -69,10 +65,6 @@ keywords='ML Flow, H2O-3', packages=packages, - package_data={"h2o": [ - "version.txt", # version file - "buildinfo.txt" # buildinfo file - ]}, # run-time dependencies install_requires=["mlflow>=1.29.0"] From fe76b539808f602bb008aac89537e300d5b1619c Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 14 Nov 2023 15:13:41 +0100 Subject: [PATCH 41/43] Remove gitignore --- h2o-py-mlflow-flavor/.gitignore | 44 --------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 h2o-py-mlflow-flavor/.gitignore diff --git a/h2o-py-mlflow-flavor/.gitignore b/h2o-py-mlflow-flavor/.gitignore deleted file mode 100644 index 70a02e1541b2..000000000000 --- a/h2o-py-mlflow-flavor/.gitignore +++ /dev/null @@ -1,44 +0,0 @@ -# Backup files -*.~ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ 
-parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# Tests -tests/results/* -tests/*/results/* -tests/*/*/results/* - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo From 7d39ab4a4c86dc227c6efbcde5ff64b77c095014 Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 14 Nov 2023 16:58:08 +0100 Subject: [PATCH 42/43] Just one doc --- h2o-py-mlflow-flavor/README.md | 92 - .../{DESCRIPTION.rst => README.rst} | 0 h2o-py-mlflow-flavor/build.gradle | 3 +- h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb | 1571 ++++++++++++++++- h2o-py-mlflow-flavor/setup.py | 2 +- 5 files changed, 1565 insertions(+), 103 deletions(-) delete mode 100644 h2o-py-mlflow-flavor/README.md rename h2o-py-mlflow-flavor/{DESCRIPTION.rst => README.rst} (100%) diff --git a/h2o-py-mlflow-flavor/README.md b/h2o-py-mlflow-flavor/README.md deleted file mode 100644 index cc41d6b7c939..000000000000 --- a/h2o-py-mlflow-flavor/README.md +++ /dev/null @@ -1,92 +0,0 @@ -# H2O-3 MLFlow Flavor - -A tiny library containing a [MLFlow](https://mlflow.org/) flavor for working with H2O-3 MOJO and POJO models. - -## Logging Models to MLFlow Registry - -The model that was trained with H2O-3 runtime can be exported to MLFlow registry with `log_model` function.: -```python -import mlflow -import h2o_mlflow_flavor -mlflow.set_tracking_uri("http://127.0.0.1:8080") - -h2o_model = ... training phase ... 
- -with mlflow.start_run(run_name="myrun") as run: - h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) -``` - -Compared to `log_model` functions of the other flavors being a part of MLFlow, this function has two extra arguments: - -* **model_type** - It indicates whether the model should be exported as - [MOJO](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/mojo-quickstart.html#what-is-a-mojo) - or [POJO](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/pojo-quickstart.html#what-is-a-pojo). - The default value is `MOJO`. -* **extra_prediction_args** A list of extra arguments for java scoring process. Possible values: - * `--setConvertInvalidNum` - The scoring process will convert invalid numbers to NA. - * `--predictContributions` - The scoring process will Return also Shapley values a long with the predictions. - Model must support that Shapley values, otherwise scoring process will throw an error. - * `--predictCalibrated` - The scoring process will also return calibrated prediction values. - -The `save_model` function that persists h2o binary model to MOJO or POJO has the same signature as the `log_model` function. - -## Extracting Information about Model -The flavor offers several functions to extract information about the model. - -* `get_metrics(h2o_model, metric_type=None)` - Extracts metrics from the trained H2O binary model. It returns dictionary and -takes following parameters: - * `h2o_model` - An H2O binary model. - * `metric_type` - The type of metrics. Possible values are "training", "validation", "cross_validation". - If parameter is not specified, metrics for all types are returned. -* `get_params(h2o_model)` - Extracts training parameters for the H2O binary model. It returns dictionary and expects one -parameter: - * `h2o_model` - An H2O binary model. 
-* `get_input_example(h2o_model, number_of_records=5, relevant_columns_only=True)` - Creates an example Pandas dataset -from the training dataset of H2O binary model. It takes following parameters: - * `h2o_model` - An H2O binary model. - * `number_of_records` - A number of records that will be extracted from the training dataset. - * `relevant_columns_only` - A flag indicating whether the output dataset should contain only columns required by - the model. Defaults to `True`. - -The functions can be utilized as follows: -```python -import mlflow -import h2o_mlflow_flavor -mlflow.set_tracking_uri("http://127.0.0.1:8080") - -h2o_model = ... training phase ... - -with mlflow.start_run(run_name="myrun") as run: - mlflow.log_params(h2o_mlflow_flavor.get_params(h2o_model)) - mlflow.log_metrics(h2o_mlflow_flavor.get_metrics(h2o_model)) - input_example = h2o_mlflow_flavor.get_input_example(h2o_model) - h2o_mlflow_flavor.log_model(h2o_model=h2o_model, - input_example=input_example, - artifact_path="folder", - model_type="MOJO", - extra_prediction_args=["--predictCalibrated"]) -``` - -## Model Scoring -After a model obtained from the model registry, the model doesn't require h2o runtime for ability to score. The only thing -that model requires is a `h2o-gemodel.jar` which was persisted with the model during saving procedure. - -The model could be loaded by the function `load_model(model_uri, dst_path=None)`. It returns an objecting making -predictions on Pandas dataframe and takes the following parameters: -* `model_uri` - An unique identifier of the model within MLFlow registry. -* `dst_path` - (Optional) A local filesystem path for downloading the persisted form of the model. 
- -The object for scoring could be obtained also via the `pyfunc` flavor as follows: -```python -import mlflow -mlflow.set_tracking_uri("http://127.0.0.1:8080") - -logged_model = 'runs:/9a42265cf0ef484c905b02afb8fe6246/iris' -loaded_model = mlflow.pyfunc.load_model(logged_model) -import pandas as pd -data = pd.read_csv("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv") -loaded_model.predict(data) -``` diff --git a/h2o-py-mlflow-flavor/DESCRIPTION.rst b/h2o-py-mlflow-flavor/README.rst similarity index 100% rename from h2o-py-mlflow-flavor/DESCRIPTION.rst rename to h2o-py-mlflow-flavor/README.rst diff --git a/h2o-py-mlflow-flavor/build.gradle b/h2o-py-mlflow-flavor/build.gradle index 2ba20ba9eb6f..1edc3ccab43e 100644 --- a/h2o-py-mlflow-flavor/build.gradle +++ b/h2o-py-mlflow-flavor/build.gradle @@ -20,8 +20,7 @@ task copySrcFiles(type: Copy) { include "setup.py" include "setup.cfg" include "h2o_mlflow_flavor/**" - include "README.md" - include "DESCRIPTION.rst" + include "README.rst" } into "${buildDir}" } diff --git a/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb index 7327c94f9a0d..b1d825a9a798 100644 --- a/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb +++ b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb @@ -2,10 +2,128 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "3ded5553", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking whether there is an H2O instance running at http://localhost:54321..... 
not found.\n", + "Attempting to start a local H2O server...\n", + " Java Version: openjdk version \"11.0.21\" 2023-10-17; OpenJDK Runtime Environment Homebrew (build 11.0.21+0); OpenJDK 64-Bit Server VM Homebrew (build 11.0.21+0, mixed mode)\n", + " Starting server from /Users/marek/git2/h2o-3/build/h2o.jar\n", + " Ice root: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i\n", + " JVM stdout: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i/h2o_marek_started_from_python.out\n", + " JVM stderr: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i/h2o_marek_started_from_python.err\n", + " Server is running at http://127.0.0.1:54321\n", + "Connecting to H2O server at http://127.0.0.1:54321 ... successful.\n", + "Warning: Version mismatch. H2O is version 3.44.0.99999, but the h2o-python package is version 3.44.0.1. This is a developer build, please contact your developer. To avoid this error message (not recommended), you can set strict_version_check=False.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
H2O_cluster_uptime:02 secs
H2O_cluster_timezone:Europe/Prague
H2O_data_parsing_timezone:UTC
H2O_cluster_version:3.44.0.99999
H2O_cluster_version_age:17 hours and 36 minutes
H2O_cluster_name:H2O_from_python_marek_u1r8vv
H2O_cluster_total_nodes:1
H2O_cluster_free_memory:8 Gb
H2O_cluster_total_cores:10
H2O_cluster_allowed_cores:10
H2O_cluster_status:locked, healthy
H2O_connection_url:http://127.0.0.1:54321
H2O_connection_proxy:{\"http\": null, \"https\": null}
H2O_internal_security:False
Python_version:3.11.5 final
\n", + "
\n" + ], + "text/plain": [ + "-------------------------- -----------------------------\n", + "H2O_cluster_uptime: 02 secs\n", + "H2O_cluster_timezone: Europe/Prague\n", + "H2O_data_parsing_timezone: UTC\n", + "H2O_cluster_version: 3.44.0.99999\n", + "H2O_cluster_version_age: 17 hours and 36 minutes\n", + "H2O_cluster_name: H2O_from_python_marek_u1r8vv\n", + "H2O_cluster_total_nodes: 1\n", + "H2O_cluster_free_memory: 8 Gb\n", + "H2O_cluster_total_cores: 10\n", + "H2O_cluster_allowed_cores: 10\n", + "H2O_cluster_status: locked, healthy\n", + "H2O_connection_url: http://127.0.0.1:54321\n", + "H2O_connection_proxy: {\"http\": null, \"https\": null}\n", + "H2O_internal_security: False\n", + "Python_version: 3.11.5 final\n", + "-------------------------- -----------------------------" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Start H2O-3 runtime.\n", "\n", @@ -15,10 +133,1294 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "5e746ad4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%\n", + "drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%\n" + ] + }, + { + "data": { + "text/html": [ + "
Model Details\n",
+       "=============\n",
+       "H2ORandomForestEstimator : Distributed Random Forest\n",
+       "Model Key: DRF_model_python_1699962414622_1\n",
+       "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Model Summary:
number_of_treesnumber_of_internal_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
10.020.03304.04.05.04.457.010.08.55
\n", + "
\n", + "
\n", + "
ModelMetricsBinomial: drf\n",
+       "** Reported on train data. **\n",
+       "\n",
+       "MSE: 0.05640340025564431\n",
+       "RMSE: 0.23749400046242075\n",
+       "LogLoss: 0.2919276910550563\n",
+       "Mean Per-Class Error: 0.07387110016420362\n",
+       "AUC: 0.971572249589491\n",
+       "AUCPR: 0.977918414938428\n",
+       "Gini: 0.9431444991789819
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5375000078231094
01ErrorRate
0107.013.00.1083 (13.0/120.0)
18.0195.00.0394 (8.0/203.0)
Total115.0208.00.065 (21.0/323.0)
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Maximum Metrics: Maximum metrics at their respective thresholds
metricthresholdvalueidx
max f10.53750000.9489051104.0
max f20.26754390.9635666127.0
max f0point50.66964290.951169991.0
max accuracy0.60879870.934984598.0
max precision0.97523220.99038468.0
max recall0.01958651.0166.0
max specificity1.00.99166670.0
max absolute_mcc0.60879870.861031598.0
max min_per_class_accuracy0.66964290.921182391.0
max mean_per_class_accuracy0.60879870.931239798.0
max tns1.0119.00.0
max fns1.0111.00.0
max fps0.0120.0168.0
max tps0.0195865203.0166.0
max tnr1.00.99166670.0
max fnr1.00.54679800.0
max fpr0.01.0168.0
max tpr0.01958651.0166.0
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Gains/Lift Table: Avg response rate: 62.77 %, avg score: 63.00 %
groupcumulative_data_fractionlower_thresholdliftcumulative_liftresponse_ratescorecumulative_response_ratecumulative_scorecapture_ratecumulative_capture_rategaincumulative_gainkolmogorov_smirnov
10.28615381.01.57600671.57600670.98924731.00.98924731.00.45098040.450980457.600674757.60067470.4427159
20.31076920.98000001.59313731.57736361.00.98495940.99009900.99880870.03921570.490196159.313725557.73636190.4819316
30.40615380.93642831.54174571.56899880.96774190.96052090.98484850.98981680.14705880.637254954.174573156.89988120.6207260
40.50769230.83241341.44830661.54486040.90909090.89365680.96969700.97058480.14705880.784313744.830659554.48603680.7429914
50.60923080.63958151.35175281.51267580.84848480.76758840.94949490.93675210.13725490.921568635.175282251.26757770.8389240
60.71076920.26534280.62759951.38623630.39393940.42653700.87012990.86386420.06372550.9852941-37.240047538.62363130.7373602
70.81230770.05656070.04827691.21899140.03030300.14591460.76515150.77412050.00490200.9901961-95.172311321.89913840.4777994
81.00.00.05223401.00.03278690.00634670.62769230.63001530.00980391.0-94.77659920.00.0
\n", + "
\n", + "
\n", + "
ModelMetricsBinomial: drf\n",
+       "** Reported on validation data. **\n",
+       "\n",
+       "MSE: 0.07456823875961219\n",
+       "RMSE: 0.27307185640342396\n",
+       "LogLoss: 0.23061493200662153\n",
+       "Mean Per-Class Error: 0.06787330316742082\n",
+       "AUC: 0.9758672699849171\n",
+       "AUCPR: 0.9767025733448528\n",
+       "Gini: 0.9517345399698343
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7285497819635679
01ErrorRate
036.03.00.0769 (3.0/39.0)
12.032.00.0588 (2.0/34.0)
Total38.035.00.0685 (5.0/73.0)
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Maximum Metrics: Maximum metrics at their respective thresholds
metricthresholdvalueidx
max f10.72854980.927536221.0
max f20.47369550.937500026.0
max f0point50.86120130.958904114.0
max accuracy0.72854980.931506821.0
max precision1.01.00.0
max recall0.04007181.035.0
max specificity1.01.00.0
max absolute_mcc0.72854980.862952821.0
max min_per_class_accuracy0.72854980.923076921.0
max mean_per_class_accuracy0.72854980.932126721.0
max tns1.039.00.0
max fns1.023.00.0
max fps0.039.040.0
max tps0.040071834.035.0
max tnr1.01.00.0
max fnr1.00.67647060.0
max fpr0.01.040.0
max tpr0.04007181.035.0
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Gains/Lift Table: Avg response rate: 46.58 %, avg score: 51.37 %
groupcumulative_data_fractionlower_thresholdliftcumulative_liftresponse_ratescorecumulative_response_ratecumulative_scorecapture_ratecumulative_capture_rategaincumulative_gainkolmogorov_smirnov
10.15068491.02.14705882.14705881.01.01.01.00.32352940.3235294114.7058824114.70588240.3235294
20.17808220.99500002.14705882.14705881.00.99500001.00.99923080.05882350.3823529114.7058824114.70588240.3823529
30.21917810.99000002.14705882.14705881.00.99165621.00.99781050.08823530.4705882114.7058824114.70588240.4705882
40.31506850.94979472.14705882.14705881.00.97376441.00.99049220.20588240.6764706114.7058824114.70588240.6764706
50.42465750.80156301.61029412.00853890.750.88171030.93548390.96241940.17647060.852941261.0294118100.85388990.8016591
60.52054790.54541290.92016811.80804950.42857140.71652080.84210530.91712230.08823530.9411765-7.983193380.80495360.7873303
70.63013700.09829370.26838241.54028130.1250.29241210.71739130.80847700.02941180.9705882-73.161764754.02813300.6372549
80.75342470.01175700.23856211.32727270.11111110.03326520.61818180.68162420.02941181.0-76.143790832.72727270.4615385
91.00.00.01.00.00.00046300.46575340.51366660.01.0-100.00.00.0
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Scoring History:
timestampdurationnumber_of_treestraining_rmsetraining_loglosstraining_auctraining_pr_auctraining_lifttraining_classification_errorvalidation_rmsevalidation_loglossvalidation_aucvalidation_pr_aucvalidation_liftvalidation_classification_error
2023-11-14 12:47:00 0.015 sec0.0nannannannannannannannannannannannan
2023-11-14 12:47:00 0.101 sec1.00.23112010.44951080.97635580.98801771.59313730.05263160.30042961.12361910.94042230.92519542.06447960.1232877
2023-11-14 12:47:00 0.122 sec2.00.25806430.38994100.96637000.98240931.59313730.07526880.27914790.23563950.96794870.96771822.14705880.0958904
2023-11-14 12:47:00 0.139 sec3.00.25253040.21337740.96891100.98135411.59313730.06639000.27156760.22449520.97511310.97500942.14705880.0821918
2023-11-14 12:47:00 0.152 sec4.00.26260930.56675750.95421600.96349651.55887620.07773850.27436160.23157390.97549020.97549272.14705880.0684932
2023-11-14 12:47:00 0.173 sec5.00.25994780.54581260.95621100.96380481.55850380.07457630.27878540.24058790.97435900.97402952.14705880.0684932
2023-11-14 12:47:00 0.187 sec6.00.24823680.30711230.96707250.97530641.57654210.07096770.27557430.23291750.97662140.97632532.14705880.0684932
2023-11-14 12:47:00 0.197 sec7.00.24377850.30005010.96910550.97691131.57671320.07028750.27862340.23910730.97662140.97654432.14705880.0684932
2023-11-14 12:47:00 0.209 sec8.00.24116150.29758880.97071490.97727831.57600670.06603770.27928260.24390570.97586730.97628272.14705880.0684932
2023-11-14 12:47:00 0.223 sec9.00.24428650.30192640.96886180.97632121.57582050.06853580.27504320.23347050.97586730.97628272.14705880.0684932
2023-11-14 12:47:00 0.233 sec10.00.23749400.29192770.97157220.97791841.57600670.06501550.27307190.23061490.97586730.97670262.14705880.0684932
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Variable Importances:
variablerelative_importancescaled_importancepercentage
displacement626.20294191.00.5767827
weight322.55953980.51510380.2971030
power98.11605070.15668410.0903727
year36.42194750.05816320.0335475
acceleration2.38212900.00380410.0021941
\n", + "
\n", + "
\n",
+       "\n",
+       "[tips]\n",
+       "Use `model.explain()` to inspect the model.\n",
+       "--\n",
+       "Use `h2o.display.toggle_user_tips()` to switch on/off this section.
" + ], + "text/plain": [ + "Model Details\n", + "=============\n", + "H2ORandomForestEstimator : Distributed Random Forest\n", + "Model Key: DRF_model_python_1699962414622_1\n", + "\n", + "\n", + "Model Summary: \n", + " number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", + "-- ----------------- -------------------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", + " 10 20 3304 4 5 4.45 7 10 8.55\n", + "\n", + "ModelMetricsBinomial: drf\n", + "** Reported on train data. **\n", + "\n", + "MSE: 0.05640340025564431\n", + "RMSE: 0.23749400046242075\n", + "LogLoss: 0.2919276910550563\n", + "Mean Per-Class Error: 0.07387110016420362\n", + "AUC: 0.971572249589491\n", + "AUCPR: 0.977918414938428\n", + "Gini: 0.9431444991789819\n", + "\n", + "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5375000078231094\n", + " 0 1 Error Rate\n", + "----- --- --- ------- ------------\n", + "0 107 13 0.1083 (13.0/120.0)\n", + "1 8 195 0.0394 (8.0/203.0)\n", + "Total 115 208 0.065 (21.0/323.0)\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + "metric threshold value idx\n", + "--------------------------- ----------- -------- -----\n", + "max f1 0.5375 0.948905 104\n", + "max f2 0.267544 0.963567 127\n", + "max f0point5 0.669643 0.95117 91\n", + "max accuracy 0.608799 0.934985 98\n", + "max precision 0.975232 0.990385 8\n", + "max recall 0.0195865 1 166\n", + "max specificity 1 0.991667 0\n", + "max absolute_mcc 0.608799 0.861032 98\n", + "max min_per_class_accuracy 0.669643 0.921182 91\n", + "max mean_per_class_accuracy 0.608799 0.93124 98\n", + "max tns 1 119 0\n", + "max fns 1 111 0\n", + "max fps 0 120 168\n", + "max tps 0.0195865 203 166\n", + "max tnr 1 0.991667 0\n", + "max fnr 1 0.546798 0\n", + "max fpr 0 1 168\n", + "max tpr 0.0195865 1 166\n", + "\n", + "Gains/Lift Table: Avg response rate: 62.77 %, 
avg score: 63.00 %\n", + "group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov\n", + "------- -------------------------- ----------------- --------- ----------------- --------------- ---------- -------------------------- ------------------ -------------- ------------------------- -------- ----------------- --------------------\n", + "1 0.286154 1 1.57601 1.57601 0.989247 1 0.989247 1 0.45098 0.45098 57.6007 57.6007 0.442716\n", + "2 0.310769 0.98 1.59314 1.57736 1 0.984959 0.990099 0.998809 0.0392157 0.490196 59.3137 57.7364 0.481932\n", + "3 0.406154 0.936428 1.54175 1.569 0.967742 0.960521 0.984848 0.989817 0.147059 0.637255 54.1746 56.8999 0.620726\n", + "4 0.507692 0.832413 1.44831 1.54486 0.909091 0.893657 0.969697 0.970585 0.147059 0.784314 44.8307 54.486 0.742991\n", + "5 0.609231 0.639581 1.35175 1.51268 0.848485 0.767588 0.949495 0.936752 0.137255 0.921569 35.1753 51.2676 0.838924\n", + "6 0.710769 0.265343 0.6276 1.38624 0.393939 0.426537 0.87013 0.863864 0.0637255 0.985294 -37.24 38.6236 0.73736\n", + "7 0.812308 0.0565607 0.0482769 1.21899 0.030303 0.145915 0.765152 0.774121 0.00490196 0.990196 -95.1723 21.8991 0.477799\n", + "8 1 0 0.052234 1 0.0327869 0.00634675 0.627692 0.630015 0.00980392 1 -94.7766 0 0\n", + "\n", + "ModelMetricsBinomial: drf\n", + "** Reported on validation data. 
**\n", + "\n", + "MSE: 0.07456823875961219\n", + "RMSE: 0.27307185640342396\n", + "LogLoss: 0.23061493200662153\n", + "Mean Per-Class Error: 0.06787330316742082\n", + "AUC: 0.9758672699849171\n", + "AUCPR: 0.9767025733448528\n", + "Gini: 0.9517345399698343\n", + "\n", + "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7285497819635679\n", + " 0 1 Error Rate\n", + "----- --- --- ------- ----------\n", + "0 36 3 0.0769 (3.0/39.0)\n", + "1 2 32 0.0588 (2.0/34.0)\n", + "Total 38 35 0.0685 (5.0/73.0)\n", + "\n", + "Maximum Metrics: Maximum metrics at their respective thresholds\n", + "metric threshold value idx\n", + "--------------------------- ----------- -------- -----\n", + "max f1 0.72855 0.927536 21\n", + "max f2 0.473696 0.9375 26\n", + "max f0point5 0.861201 0.958904 14\n", + "max accuracy 0.72855 0.931507 21\n", + "max precision 1 1 0\n", + "max recall 0.0400718 1 35\n", + "max specificity 1 1 0\n", + "max absolute_mcc 0.72855 0.862953 21\n", + "max min_per_class_accuracy 0.72855 0.923077 21\n", + "max mean_per_class_accuracy 0.72855 0.932127 21\n", + "max tns 1 39 0\n", + "max fns 1 23 0\n", + "max fps 0 39 40\n", + "max tps 0.0400718 34 35\n", + "max tnr 1 1 0\n", + "max fnr 1 0.676471 0\n", + "max fpr 0 1 40\n", + "max tpr 0.0400718 1 35\n", + "\n", + "Gains/Lift Table: Avg response rate: 46.58 %, avg score: 51.37 %\n", + "group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov\n", + "------- -------------------------- ----------------- -------- ----------------- --------------- ----------- -------------------------- ------------------ -------------- ------------------------- -------- ----------------- --------------------\n", + "1 0.150685 1 2.14706 2.14706 1 1 1 1 0.323529 0.323529 114.706 114.706 0.323529\n", + "2 0.178082 0.995 2.14706 2.14706 1 0.995 1 0.999231 0.0588235 0.382353 114.706 114.706 
0.382353\n", + "3 0.219178 0.99 2.14706 2.14706 1 0.991656 1 0.997811 0.0882353 0.470588 114.706 114.706 0.470588\n", + "4 0.315068 0.949795 2.14706 2.14706 1 0.973764 1 0.990492 0.205882 0.676471 114.706 114.706 0.676471\n", + "5 0.424658 0.801563 1.61029 2.00854 0.75 0.88171 0.935484 0.962419 0.176471 0.852941 61.0294 100.854 0.801659\n", + "6 0.520548 0.545413 0.920168 1.80805 0.428571 0.716521 0.842105 0.917122 0.0882353 0.941176 -7.98319 80.805 0.78733\n", + "7 0.630137 0.0982937 0.268382 1.54028 0.125 0.292412 0.717391 0.808477 0.0294118 0.970588 -73.1618 54.0281 0.637255\n", + "8 0.753425 0.011757 0.238562 1.32727 0.111111 0.0332652 0.618182 0.681624 0.0294118 1 -76.1438 32.7273 0.461538\n", + "9 1 0 0 1 0 0.000462963 0.465753 0.513667 0 1 -100 0 0\n", + "\n", + "Scoring History: \n", + " timestamp duration number_of_trees training_rmse training_logloss training_auc training_pr_auc training_lift training_classification_error validation_rmse validation_logloss validation_auc validation_pr_auc validation_lift validation_classification_error\n", + "-- ------------------- ---------- ----------------- --------------- ------------------ -------------- ----------------- --------------- ------------------------------- ----------------- -------------------- ---------------- ------------------- ----------------- ---------------------------------\n", + " 2023-11-14 12:47:00 0.015 sec 0 nan nan nan nan nan nan nan nan nan nan nan nan\n", + " 2023-11-14 12:47:00 0.101 sec 1 0.23112 0.449511 0.976356 0.988018 1.59314 0.0526316 0.30043 1.12362 0.940422 0.925195 2.06448 0.123288\n", + " 2023-11-14 12:47:00 0.122 sec 2 0.258064 0.389941 0.96637 0.982409 1.59314 0.0752688 0.279148 0.23564 0.967949 0.967718 2.14706 0.0958904\n", + " 2023-11-14 12:47:00 0.139 sec 3 0.25253 0.213377 0.968911 0.981354 1.59314 0.06639 0.271568 0.224495 0.975113 0.975009 2.14706 0.0821918\n", + " 2023-11-14 12:47:00 0.152 sec 4 0.262609 0.566757 0.954216 0.963496 1.55888 0.0777385 0.274362 0.231574 
0.97549 0.975493 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.173 sec 5 0.259948 0.545813 0.956211 0.963805 1.5585 0.0745763 0.278785 0.240588 0.974359 0.97403 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.187 sec 6 0.248237 0.307112 0.967073 0.975306 1.57654 0.0709677 0.275574 0.232917 0.976621 0.976325 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.197 sec 7 0.243779 0.30005 0.969106 0.976911 1.57671 0.0702875 0.278623 0.239107 0.976621 0.976544 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.209 sec 8 0.241162 0.297589 0.970715 0.977278 1.57601 0.0660377 0.279283 0.243906 0.975867 0.976283 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.223 sec 9 0.244287 0.301926 0.968862 0.976321 1.57582 0.0685358 0.275043 0.23347 0.975867 0.976283 2.14706 0.0684932\n", + " 2023-11-14 12:47:00 0.233 sec 10 0.237494 0.291928 0.971572 0.977918 1.57601 0.0650155 0.273072 0.230615 0.975867 0.976703 2.14706 0.0684932\n", + "\n", + "Variable Importances: \n", + "variable relative_importance scaled_importance percentage\n", + "------------ --------------------- ------------------- ------------\n", + "displacement 626.203 1 0.576783\n", + "weight 322.56 0.515104 0.297103\n", + "power 98.1161 0.156684 0.0903727\n", + "year 36.4219 0.0581632 0.0335475\n", + "acceleration 2.38213 0.00380408 0.00219413\n", + "\n", + "[tips]\n", + "Use `model.explain()` to inspect the model.\n", + "--\n", + "Use `h2o.display.toggle_user_tips()` to switch on/off this section." 
+ ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Configure DRF algorithm and train a model.\n", "\n", @@ -49,10 +1451,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "29eb0722", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/marek/opt/anaconda3/envs/mlflow/lib/python3.11/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + " warnings.warn(\"Setuptools is replacing distutils.\")\n" + ] + } + ], "source": [ "# Log the model to an MLFlow reqistry.\n", "\n", @@ -71,10 +1482,154 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "bed1dafe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
predictp0p1cal_p0cal_p1
000.9900000.0100000.9936180.006382
101.0000000.0000000.9941000.005900
200.9718180.0281820.9926390.007361
300.5139320.4860680.7834780.216522
400.6410560.3589440.9080930.091907
..................
40100.3402440.6597560.4784110.521589
40200.2319990.7680010.2805460.719454
40300.4565240.5434760.6968670.303133
40400.4174570.5825430.6280170.371983
40510.1966310.8033690.2277210.772279
\n", + "

406 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " predict p0 p1 cal_p0 cal_p1\n", + "0 0 0.990000 0.010000 0.993618 0.006382\n", + "1 0 1.000000 0.000000 0.994100 0.005900\n", + "2 0 0.971818 0.028182 0.992639 0.007361\n", + "3 0 0.513932 0.486068 0.783478 0.216522\n", + "4 0 0.641056 0.358944 0.908093 0.091907\n", + ".. ... ... ... ... ...\n", + "401 0 0.340244 0.659756 0.478411 0.521589\n", + "402 0 0.231999 0.768001 0.280546 0.719454\n", + "403 0 0.456524 0.543476 0.696867 0.303133\n", + "404 0 0.417457 0.582543 0.628017 0.371983\n", + "405 1 0.196631 0.803369 0.227721 0.772279\n", + "\n", + "[406 rows x 5 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Load model from the MLFlow registry and score with the model.\n", "\n", diff --git a/h2o-py-mlflow-flavor/setup.py b/h2o-py-mlflow-flavor/setup.py index 5730e35ed4e7..2335b221fda8 100644 --- a/h2o-py-mlflow-flavor/setup.py +++ b/h2o-py-mlflow-flavor/setup.py @@ -6,7 +6,7 @@ here = os.path.abspath(os.path.dirname(__file__)) # Get the long description from the relevant file -with open(os.path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f: +with open(os.path.join(here, 'README.rst'), encoding='utf-8') as f: long_description = f.read() version = "0.1.0" From 237069bf836ae1087e2c51bf407c48a6b5d5698f Mon Sep 17 00:00:00 2001 From: Marek Novotny Date: Tue, 14 Nov 2023 17:02:34 +0100 Subject: [PATCH 43/43] Revert DRF_mojo.ipynb --- h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb | 1571 +----------------- 1 file changed, 8 insertions(+), 1563 deletions(-) diff --git a/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb index b1d825a9a798..7327c94f9a0d 100644 --- a/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb +++ b/h2o-py-mlflow-flavor/examples/DRF_mojo.ipynb @@ -2,128 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "3ded5553", "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "Checking whether there is an H2O instance running at http://localhost:54321..... not found.\n", - "Attempting to start a local H2O server...\n", - " Java Version: openjdk version \"11.0.21\" 2023-10-17; OpenJDK Runtime Environment Homebrew (build 11.0.21+0); OpenJDK 64-Bit Server VM Homebrew (build 11.0.21+0, mixed mode)\n", - " Starting server from /Users/marek/git2/h2o-3/build/h2o.jar\n", - " Ice root: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i\n", - " JVM stdout: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i/h2o_marek_started_from_python.out\n", - " JVM stderr: /var/folders/bz/gzkngzwj2593j90gdmscv89c0000gn/T/tmpqhbfsn1i/h2o_marek_started_from_python.err\n", - " Server is running at http://127.0.0.1:54321\n", - "Connecting to H2O server at http://127.0.0.1:54321 ... successful.\n", - "Warning: Version mismatch. H2O is version 3.44.0.99999, but the h2o-python package is version 3.44.0.1. This is a developer build, please contact your developer. To avoid this error message (not recommended), you can set strict_version_check=False.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
H2O_cluster_uptime:02 secs
H2O_cluster_timezone:Europe/Prague
H2O_data_parsing_timezone:UTC
H2O_cluster_version:3.44.0.99999
H2O_cluster_version_age:17 hours and 36 minutes
H2O_cluster_name:H2O_from_python_marek_u1r8vv
H2O_cluster_total_nodes:1
H2O_cluster_free_memory:8 Gb
H2O_cluster_total_cores:10
H2O_cluster_allowed_cores:10
H2O_cluster_status:locked, healthy
H2O_connection_url:http://127.0.0.1:54321
H2O_connection_proxy:{\"http\": null, \"https\": null}
H2O_internal_security:False
Python_version:3.11.5 final
\n", - "
\n" - ], - "text/plain": [ - "-------------------------- -----------------------------\n", - "H2O_cluster_uptime: 02 secs\n", - "H2O_cluster_timezone: Europe/Prague\n", - "H2O_data_parsing_timezone: UTC\n", - "H2O_cluster_version: 3.44.0.99999\n", - "H2O_cluster_version_age: 17 hours and 36 minutes\n", - "H2O_cluster_name: H2O_from_python_marek_u1r8vv\n", - "H2O_cluster_total_nodes: 1\n", - "H2O_cluster_free_memory: 8 Gb\n", - "H2O_cluster_total_cores: 10\n", - "H2O_cluster_allowed_cores: 10\n", - "H2O_cluster_status: locked, healthy\n", - "H2O_connection_url: http://127.0.0.1:54321\n", - "H2O_connection_proxy: {\"http\": null, \"https\": null}\n", - "H2O_internal_security: False\n", - "Python_version: 3.11.5 final\n", - "-------------------------- -----------------------------" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Start H2O-3 runtime.\n", "\n", @@ -133,1294 +15,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "5e746ad4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%\n", - "drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%\n" - ] - }, - { - "data": { - "text/html": [ - "
Model Details\n",
-       "=============\n",
-       "H2ORandomForestEstimator : Distributed Random Forest\n",
-       "Model Key: DRF_model_python_1699962414622_1\n",
-       "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Model Summary:
number_of_treesnumber_of_internal_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
10.020.03304.04.05.04.457.010.08.55
\n", - "
\n", - "
\n", - "
ModelMetricsBinomial: drf\n",
-       "** Reported on train data. **\n",
-       "\n",
-       "MSE: 0.05640340025564431\n",
-       "RMSE: 0.23749400046242075\n",
-       "LogLoss: 0.2919276910550563\n",
-       "Mean Per-Class Error: 0.07387110016420362\n",
-       "AUC: 0.971572249589491\n",
-       "AUCPR: 0.977918414938428\n",
-       "Gini: 0.9431444991789819
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5375000078231094
01ErrorRate
0107.013.00.1083 (13.0/120.0)
18.0195.00.0394 (8.0/203.0)
Total115.0208.00.065 (21.0/323.0)
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Maximum Metrics: Maximum metrics at their respective thresholds
metricthresholdvalueidx
max f10.53750000.9489051104.0
max f20.26754390.9635666127.0
max f0point50.66964290.951169991.0
max accuracy0.60879870.934984598.0
max precision0.97523220.99038468.0
max recall0.01958651.0166.0
max specificity1.00.99166670.0
max absolute_mcc0.60879870.861031598.0
max min_per_class_accuracy0.66964290.921182391.0
max mean_per_class_accuracy0.60879870.931239798.0
max tns1.0119.00.0
max fns1.0111.00.0
max fps0.0120.0168.0
max tps0.0195865203.0166.0
max tnr1.00.99166670.0
max fnr1.00.54679800.0
max fpr0.01.0168.0
max tpr0.01958651.0166.0
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Gains/Lift Table: Avg response rate: 62.77 %, avg score: 63.00 %
groupcumulative_data_fractionlower_thresholdliftcumulative_liftresponse_ratescorecumulative_response_ratecumulative_scorecapture_ratecumulative_capture_rategaincumulative_gainkolmogorov_smirnov
10.28615381.01.57600671.57600670.98924731.00.98924731.00.45098040.450980457.600674757.60067470.4427159
20.31076920.98000001.59313731.57736361.00.98495940.99009900.99880870.03921570.490196159.313725557.73636190.4819316
30.40615380.93642831.54174571.56899880.96774190.96052090.98484850.98981680.14705880.637254954.174573156.89988120.6207260
40.50769230.83241341.44830661.54486040.90909090.89365680.96969700.97058480.14705880.784313744.830659554.48603680.7429914
50.60923080.63958151.35175281.51267580.84848480.76758840.94949490.93675210.13725490.921568635.175282251.26757770.8389240
60.71076920.26534280.62759951.38623630.39393940.42653700.87012990.86386420.06372550.9852941-37.240047538.62363130.7373602
70.81230770.05656070.04827691.21899140.03030300.14591460.76515150.77412050.00490200.9901961-95.172311321.89913840.4777994
81.00.00.05223401.00.03278690.00634670.62769230.63001530.00980391.0-94.77659920.00.0
\n", - "
\n", - "
\n", - "
ModelMetricsBinomial: drf\n",
-       "** Reported on validation data. **\n",
-       "\n",
-       "MSE: 0.07456823875961219\n",
-       "RMSE: 0.27307185640342396\n",
-       "LogLoss: 0.23061493200662153\n",
-       "Mean Per-Class Error: 0.06787330316742082\n",
-       "AUC: 0.9758672699849171\n",
-       "AUCPR: 0.9767025733448528\n",
-       "Gini: 0.9517345399698343
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7285497819635679
01ErrorRate
036.03.00.0769 (3.0/39.0)
12.032.00.0588 (2.0/34.0)
Total38.035.00.0685 (5.0/73.0)
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Maximum Metrics: Maximum metrics at their respective thresholds
metricthresholdvalueidx
max f10.72854980.927536221.0
max f20.47369550.937500026.0
max f0point50.86120130.958904114.0
max accuracy0.72854980.931506821.0
max precision1.01.00.0
max recall0.04007181.035.0
max specificity1.01.00.0
max absolute_mcc0.72854980.862952821.0
max min_per_class_accuracy0.72854980.923076921.0
max mean_per_class_accuracy0.72854980.932126721.0
max tns1.039.00.0
max fns1.023.00.0
max fps0.039.040.0
max tps0.040071834.035.0
max tnr1.01.00.0
max fnr1.00.67647060.0
max fpr0.01.040.0
max tpr0.04007181.035.0
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Gains/Lift Table: Avg response rate: 46.58 %, avg score: 51.37 %
groupcumulative_data_fractionlower_thresholdliftcumulative_liftresponse_ratescorecumulative_response_ratecumulative_scorecapture_ratecumulative_capture_rategaincumulative_gainkolmogorov_smirnov
10.15068491.02.14705882.14705881.01.01.01.00.32352940.3235294114.7058824114.70588240.3235294
20.17808220.99500002.14705882.14705881.00.99500001.00.99923080.05882350.3823529114.7058824114.70588240.3823529
30.21917810.99000002.14705882.14705881.00.99165621.00.99781050.08823530.4705882114.7058824114.70588240.4705882
40.31506850.94979472.14705882.14705881.00.97376441.00.99049220.20588240.6764706114.7058824114.70588240.6764706
50.42465750.80156301.61029412.00853890.750.88171030.93548390.96241940.17647060.852941261.0294118100.85388990.8016591
60.52054790.54541290.92016811.80804950.42857140.71652080.84210530.91712230.08823530.9411765-7.983193380.80495360.7873303
70.63013700.09829370.26838241.54028130.1250.29241210.71739130.80847700.02941180.9705882-73.161764754.02813300.6372549
80.75342470.01175700.23856211.32727270.11111110.03326520.61818180.68162420.02941181.0-76.143790832.72727270.4615385
91.00.00.01.00.00.00046300.46575340.51366660.01.0-100.00.00.0
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Scoring History:
timestampdurationnumber_of_treestraining_rmsetraining_loglosstraining_auctraining_pr_auctraining_lifttraining_classification_errorvalidation_rmsevalidation_loglossvalidation_aucvalidation_pr_aucvalidation_liftvalidation_classification_error
2023-11-14 12:47:00 0.015 sec0.0nannannannannannannannannannannannan
2023-11-14 12:47:00 0.101 sec1.00.23112010.44951080.97635580.98801771.59313730.05263160.30042961.12361910.94042230.92519542.06447960.1232877
2023-11-14 12:47:00 0.122 sec2.00.25806430.38994100.96637000.98240931.59313730.07526880.27914790.23563950.96794870.96771822.14705880.0958904
2023-11-14 12:47:00 0.139 sec3.00.25253040.21337740.96891100.98135411.59313730.06639000.27156760.22449520.97511310.97500942.14705880.0821918
2023-11-14 12:47:00 0.152 sec4.00.26260930.56675750.95421600.96349651.55887620.07773850.27436160.23157390.97549020.97549272.14705880.0684932
2023-11-14 12:47:00 0.173 sec5.00.25994780.54581260.95621100.96380481.55850380.07457630.27878540.24058790.97435900.97402952.14705880.0684932
2023-11-14 12:47:00 0.187 sec6.00.24823680.30711230.96707250.97530641.57654210.07096770.27557430.23291750.97662140.97632532.14705880.0684932
2023-11-14 12:47:00 0.197 sec7.00.24377850.30005010.96910550.97691131.57671320.07028750.27862340.23910730.97662140.97654432.14705880.0684932
2023-11-14 12:47:00 0.209 sec8.00.24116150.29758880.97071490.97727831.57600670.06603770.27928260.24390570.97586730.97628272.14705880.0684932
2023-11-14 12:47:00 0.223 sec9.00.24428650.30192640.96886180.97632121.57582050.06853580.27504320.23347050.97586730.97628272.14705880.0684932
2023-11-14 12:47:00 0.233 sec10.00.23749400.29192770.97157220.97791841.57600670.06501550.27307190.23061490.97586730.97670262.14705880.0684932
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
Variable Importances:
variablerelative_importancescaled_importancepercentage
displacement626.20294191.00.5767827
weight322.55953980.51510380.2971030
power98.11605070.15668410.0903727
year36.42194750.05816320.0335475
acceleration2.38212900.00380410.0021941
\n", - "
\n", - "
\n",
-       "\n",
-       "[tips]\n",
-       "Use `model.explain()` to inspect the model.\n",
-       "--\n",
-       "Use `h2o.display.toggle_user_tips()` to switch on/off this section.
" - ], - "text/plain": [ - "Model Details\n", - "=============\n", - "H2ORandomForestEstimator : Distributed Random Forest\n", - "Model Key: DRF_model_python_1699962414622_1\n", - "\n", - "\n", - "Model Summary: \n", - " number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", - "-- ----------------- -------------------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", - " 10 20 3304 4 5 4.45 7 10 8.55\n", - "\n", - "ModelMetricsBinomial: drf\n", - "** Reported on train data. **\n", - "\n", - "MSE: 0.05640340025564431\n", - "RMSE: 0.23749400046242075\n", - "LogLoss: 0.2919276910550563\n", - "Mean Per-Class Error: 0.07387110016420362\n", - "AUC: 0.971572249589491\n", - "AUCPR: 0.977918414938428\n", - "Gini: 0.9431444991789819\n", - "\n", - "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5375000078231094\n", - " 0 1 Error Rate\n", - "----- --- --- ------- ------------\n", - "0 107 13 0.1083 (13.0/120.0)\n", - "1 8 195 0.0394 (8.0/203.0)\n", - "Total 115 208 0.065 (21.0/323.0)\n", - "\n", - "Maximum Metrics: Maximum metrics at their respective thresholds\n", - "metric threshold value idx\n", - "--------------------------- ----------- -------- -----\n", - "max f1 0.5375 0.948905 104\n", - "max f2 0.267544 0.963567 127\n", - "max f0point5 0.669643 0.95117 91\n", - "max accuracy 0.608799 0.934985 98\n", - "max precision 0.975232 0.990385 8\n", - "max recall 0.0195865 1 166\n", - "max specificity 1 0.991667 0\n", - "max absolute_mcc 0.608799 0.861032 98\n", - "max min_per_class_accuracy 0.669643 0.921182 91\n", - "max mean_per_class_accuracy 0.608799 0.93124 98\n", - "max tns 1 119 0\n", - "max fns 1 111 0\n", - "max fps 0 120 168\n", - "max tps 0.0195865 203 166\n", - "max tnr 1 0.991667 0\n", - "max fnr 1 0.546798 0\n", - "max fpr 0 1 168\n", - "max tpr 0.0195865 1 166\n", - "\n", - "Gains/Lift Table: Avg response rate: 62.77 %, 
avg score: 63.00 %\n", - "group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov\n", - "------- -------------------------- ----------------- --------- ----------------- --------------- ---------- -------------------------- ------------------ -------------- ------------------------- -------- ----------------- --------------------\n", - "1 0.286154 1 1.57601 1.57601 0.989247 1 0.989247 1 0.45098 0.45098 57.6007 57.6007 0.442716\n", - "2 0.310769 0.98 1.59314 1.57736 1 0.984959 0.990099 0.998809 0.0392157 0.490196 59.3137 57.7364 0.481932\n", - "3 0.406154 0.936428 1.54175 1.569 0.967742 0.960521 0.984848 0.989817 0.147059 0.637255 54.1746 56.8999 0.620726\n", - "4 0.507692 0.832413 1.44831 1.54486 0.909091 0.893657 0.969697 0.970585 0.147059 0.784314 44.8307 54.486 0.742991\n", - "5 0.609231 0.639581 1.35175 1.51268 0.848485 0.767588 0.949495 0.936752 0.137255 0.921569 35.1753 51.2676 0.838924\n", - "6 0.710769 0.265343 0.6276 1.38624 0.393939 0.426537 0.87013 0.863864 0.0637255 0.985294 -37.24 38.6236 0.73736\n", - "7 0.812308 0.0565607 0.0482769 1.21899 0.030303 0.145915 0.765152 0.774121 0.00490196 0.990196 -95.1723 21.8991 0.477799\n", - "8 1 0 0.052234 1 0.0327869 0.00634675 0.627692 0.630015 0.00980392 1 -94.7766 0 0\n", - "\n", - "ModelMetricsBinomial: drf\n", - "** Reported on validation data. 
**\n", - "\n", - "MSE: 0.07456823875961219\n", - "RMSE: 0.27307185640342396\n", - "LogLoss: 0.23061493200662153\n", - "Mean Per-Class Error: 0.06787330316742082\n", - "AUC: 0.9758672699849171\n", - "AUCPR: 0.9767025733448528\n", - "Gini: 0.9517345399698343\n", - "\n", - "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7285497819635679\n", - " 0 1 Error Rate\n", - "----- --- --- ------- ----------\n", - "0 36 3 0.0769 (3.0/39.0)\n", - "1 2 32 0.0588 (2.0/34.0)\n", - "Total 38 35 0.0685 (5.0/73.0)\n", - "\n", - "Maximum Metrics: Maximum metrics at their respective thresholds\n", - "metric threshold value idx\n", - "--------------------------- ----------- -------- -----\n", - "max f1 0.72855 0.927536 21\n", - "max f2 0.473696 0.9375 26\n", - "max f0point5 0.861201 0.958904 14\n", - "max accuracy 0.72855 0.931507 21\n", - "max precision 1 1 0\n", - "max recall 0.0400718 1 35\n", - "max specificity 1 1 0\n", - "max absolute_mcc 0.72855 0.862953 21\n", - "max min_per_class_accuracy 0.72855 0.923077 21\n", - "max mean_per_class_accuracy 0.72855 0.932127 21\n", - "max tns 1 39 0\n", - "max fns 1 23 0\n", - "max fps 0 39 40\n", - "max tps 0.0400718 34 35\n", - "max tnr 1 1 0\n", - "max fnr 1 0.676471 0\n", - "max fpr 0 1 40\n", - "max tpr 0.0400718 1 35\n", - "\n", - "Gains/Lift Table: Avg response rate: 46.58 %, avg score: 51.37 %\n", - "group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate score cumulative_response_rate cumulative_score capture_rate cumulative_capture_rate gain cumulative_gain kolmogorov_smirnov\n", - "------- -------------------------- ----------------- -------- ----------------- --------------- ----------- -------------------------- ------------------ -------------- ------------------------- -------- ----------------- --------------------\n", - "1 0.150685 1 2.14706 2.14706 1 1 1 1 0.323529 0.323529 114.706 114.706 0.323529\n", - "2 0.178082 0.995 2.14706 2.14706 1 0.995 1 0.999231 0.0588235 0.382353 114.706 114.706 
0.382353\n", - "3 0.219178 0.99 2.14706 2.14706 1 0.991656 1 0.997811 0.0882353 0.470588 114.706 114.706 0.470588\n", - "4 0.315068 0.949795 2.14706 2.14706 1 0.973764 1 0.990492 0.205882 0.676471 114.706 114.706 0.676471\n", - "5 0.424658 0.801563 1.61029 2.00854 0.75 0.88171 0.935484 0.962419 0.176471 0.852941 61.0294 100.854 0.801659\n", - "6 0.520548 0.545413 0.920168 1.80805 0.428571 0.716521 0.842105 0.917122 0.0882353 0.941176 -7.98319 80.805 0.78733\n", - "7 0.630137 0.0982937 0.268382 1.54028 0.125 0.292412 0.717391 0.808477 0.0294118 0.970588 -73.1618 54.0281 0.637255\n", - "8 0.753425 0.011757 0.238562 1.32727 0.111111 0.0332652 0.618182 0.681624 0.0294118 1 -76.1438 32.7273 0.461538\n", - "9 1 0 0 1 0 0.000462963 0.465753 0.513667 0 1 -100 0 0\n", - "\n", - "Scoring History: \n", - " timestamp duration number_of_trees training_rmse training_logloss training_auc training_pr_auc training_lift training_classification_error validation_rmse validation_logloss validation_auc validation_pr_auc validation_lift validation_classification_error\n", - "-- ------------------- ---------- ----------------- --------------- ------------------ -------------- ----------------- --------------- ------------------------------- ----------------- -------------------- ---------------- ------------------- ----------------- ---------------------------------\n", - " 2023-11-14 12:47:00 0.015 sec 0 nan nan nan nan nan nan nan nan nan nan nan nan\n", - " 2023-11-14 12:47:00 0.101 sec 1 0.23112 0.449511 0.976356 0.988018 1.59314 0.0526316 0.30043 1.12362 0.940422 0.925195 2.06448 0.123288\n", - " 2023-11-14 12:47:00 0.122 sec 2 0.258064 0.389941 0.96637 0.982409 1.59314 0.0752688 0.279148 0.23564 0.967949 0.967718 2.14706 0.0958904\n", - " 2023-11-14 12:47:00 0.139 sec 3 0.25253 0.213377 0.968911 0.981354 1.59314 0.06639 0.271568 0.224495 0.975113 0.975009 2.14706 0.0821918\n", - " 2023-11-14 12:47:00 0.152 sec 4 0.262609 0.566757 0.954216 0.963496 1.55888 0.0777385 0.274362 0.231574 
0.97549 0.975493 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.173 sec 5 0.259948 0.545813 0.956211 0.963805 1.5585 0.0745763 0.278785 0.240588 0.974359 0.97403 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.187 sec 6 0.248237 0.307112 0.967073 0.975306 1.57654 0.0709677 0.275574 0.232917 0.976621 0.976325 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.197 sec 7 0.243779 0.30005 0.969106 0.976911 1.57671 0.0702875 0.278623 0.239107 0.976621 0.976544 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.209 sec 8 0.241162 0.297589 0.970715 0.977278 1.57601 0.0660377 0.279283 0.243906 0.975867 0.976283 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.223 sec 9 0.244287 0.301926 0.968862 0.976321 1.57582 0.0685358 0.275043 0.23347 0.975867 0.976283 2.14706 0.0684932\n", - " 2023-11-14 12:47:00 0.233 sec 10 0.237494 0.291928 0.971572 0.977918 1.57601 0.0650155 0.273072 0.230615 0.975867 0.976703 2.14706 0.0684932\n", - "\n", - "Variable Importances: \n", - "variable relative_importance scaled_importance percentage\n", - "------------ --------------------- ------------------- ------------\n", - "displacement 626.203 1 0.576783\n", - "weight 322.56 0.515104 0.297103\n", - "power 98.1161 0.156684 0.0903727\n", - "year 36.4219 0.0581632 0.0335475\n", - "acceleration 2.38213 0.00380408 0.00219413\n", - "\n", - "[tips]\n", - "Use `model.explain()` to inspect the model.\n", - "--\n", - "Use `h2o.display.toggle_user_tips()` to switch on/off this section." 
- ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Configure DRF algorithm and train a model.\n", "\n", @@ -1451,19 +49,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "29eb0722", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/marek/opt/anaconda3/envs/mlflow/lib/python3.11/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", - " warnings.warn(\"Setuptools is replacing distutils.\")\n" - ] - } - ], + "outputs": [], "source": [ "# Log the model to an MLFlow reqistry.\n", "\n", @@ -1482,154 +71,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "bed1dafe", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predictp0p1cal_p0cal_p1
000.9900000.0100000.9936180.006382
101.0000000.0000000.9941000.005900
200.9718180.0281820.9926390.007361
300.5139320.4860680.7834780.216522
400.6410560.3589440.9080930.091907
..................
40100.3402440.6597560.4784110.521589
40200.2319990.7680010.2805460.719454
40300.4565240.5434760.6968670.303133
40400.4174570.5825430.6280170.371983
40510.1966310.8033690.2277210.772279
\n", - "

406 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " predict p0 p1 cal_p0 cal_p1\n", - "0 0 0.990000 0.010000 0.993618 0.006382\n", - "1 0 1.000000 0.000000 0.994100 0.005900\n", - "2 0 0.971818 0.028182 0.992639 0.007361\n", - "3 0 0.513932 0.486068 0.783478 0.216522\n", - "4 0 0.641056 0.358944 0.908093 0.091907\n", - ".. ... ... ... ... ...\n", - "401 0 0.340244 0.659756 0.478411 0.521589\n", - "402 0 0.231999 0.768001 0.280546 0.719454\n", - "403 0 0.456524 0.543476 0.696867 0.303133\n", - "404 0 0.417457 0.582543 0.628017 0.371983\n", - "405 1 0.196631 0.803369 0.227721 0.772279\n", - "\n", - "[406 rows x 5 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Load model from the MLFlow registry and score with the model.\n", "\n",