diff --git a/examples/demo_exponentiated_gradient_reduction.ipynb b/examples/demo_exponentiated_gradient_reduction.ipynb index 40c6ca70..d4c23a0e 100644 --- a/examples/demo_exponentiated_gradient_reduction.ipynb +++ b/examples/demo_exponentiated_gradient_reduction.ipynb @@ -1,511 +1,791 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exponentiated Gradient Reduction\n", - "\n", - "Exponentiated gradient reduction is an in-processing technique that reduces fair classification to a sequence of cost-sensitive classification problems, returning a randomized classifier with the lowest empirical error subject to \n", - "fair classification constraints. The code for exponentiated gradient reduction wraps the source class \n", - "`fairlearn.reductions.ExponentiatedGradient` available in the https://github.com/fairlearn/fairlearn library,\n", - "licensed under the MIT Licencse, Copyright Microsoft Corporation.\n", - "\n", - "This version of exponentiated gradient reduction (implemented in `aif360.algorithms`) wraps the sklearn compatible version of exponentiated gradient reduction implemented in `aif360.sklearn`. For a detailed tutorial on sklearn compatible exponentiated gradient reduction see [examples/sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb](sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb). 
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings(\"ignore\", category=FutureWarning)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Load all necessary packages\n", - "from aif360.metrics import BinaryLabelDatasetMetric\n", - "from aif360.metrics import ClassificationMetric\n", - "\n", - "from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult\n", - "\n", - "from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction\n", - "\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.preprocessing import MaxAbsScaler\n", - "from sklearn.metrics import accuracy_score\n", - "\n", - "from IPython.display import Markdown, display\n", - "\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Load dataset and set options" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the dataset and split into train and test\n", - "dataset_orig = load_preproc_data_adult()\n", - "\n", - "privileged_groups = [{'sex': 1}]\n", - "unprivileged_groups = [{'sex': 0}]\n", - "\n", - "np.random.seed(0)\n", - "dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + "cells": [ { - "data": { - "text/markdown": [ - "#### Training Dataset shape" + "cell_type": "markdown", + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Trusted-AI/AIF360/blob/master/examples/demo_exponentiated_gradient_reduction.ipynb)" ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": 
"display_data" + "metadata": { + "id": "n06sYQJT-BDU" + } }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "(34189, 18)\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "i8ZhANU1zddQ" + }, + "source": [ + "# Exponentiated Gradient Reduction\n", + "\n", + "Exponentiated gradient reduction is an in-processing technique that reduces fair classification to a sequence of cost-sensitive classification problems, returning a randomized classifier with the lowest empirical error subject to\n", + "fair classification constraints. The code for exponentiated gradient reduction wraps the source class\n", + "`fairlearn.reductions.ExponentiatedGradient` available in the https://github.com/fairlearn/fairlearn library,\n", + "licensed under the MIT Licencse, Copyright Microsoft Corporation.\n", + "\n", + "This version of exponentiated gradient reduction (implemented in `aif360.algorithms`) wraps the sklearn compatible version of exponentiated gradient reduction implemented in `aif360.sklearn`. For a detailed tutorial on sklearn compatible exponentiated gradient reduction see [examples/sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb](sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb)." 
+ ] }, { - "data": { - "text/markdown": [ - "#### Favorable and unfavorable labels" + "cell_type": "code", + "source": [ + "!pip install aif360\n", + "!pip install aif360[LawSchoolGPA]\n", + "!pip install aif360[Reductions]" ], - "text/plain": [ - "" + "metadata": { + "id": "x9ak1H2JziK-", + "outputId": "5ce9ec90-e7ad-4568-ca5d-1814a65076bc", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting aif360\n", + " Downloading aif360-0.5.0-py3-none-any.whl (214 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/214.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.7/214.1 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m214.1/214.1 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.10/dist-packages (from aif360) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from aif360) (1.11.2)\n", + "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from aif360) (1.5.3)\n", + "Requirement already satisfied: scikit-learn>=1.0 in /usr/local/lib/python3.10/dist-packages (from aif360) (1.2.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from aif360) (3.7.1)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360) (2023.3.post1)\n", + 
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360) (3.2.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (1.4.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360) (3.1.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=0.24.0->aif360) (1.16.0)\n", + "Installing collected packages: aif360\n", + "Successfully installed aif360-0.5.0\n", + "Requirement already satisfied: aif360[LawSchoolGPA] in /usr/local/lib/python3.10/dist-packages (0.5.0)\n", + "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.10/dist-packages (from aif360[LawSchoolGPA]) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from aif360[LawSchoolGPA]) (1.11.2)\n", + "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from aif360[LawSchoolGPA]) (1.5.3)\n", + "Requirement already satisfied: scikit-learn>=1.0 in 
/usr/local/lib/python3.10/dist-packages (from aif360[LawSchoolGPA]) (1.2.2)\n", + "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from aif360[LawSchoolGPA]) (3.7.1)\n", + "Collecting tempeh (from aif360[LawSchoolGPA])\n", + " Downloading tempeh-0.1.12-py3-none-any.whl (39 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360[LawSchoolGPA]) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360[LawSchoolGPA]) (2023.3.post1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360[LawSchoolGPA]) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360[LawSchoolGPA]) (3.2.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (1.4.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[LawSchoolGPA]) (3.1.1)\n", + "Collecting memory-profiler (from 
tempeh->aif360[LawSchoolGPA])\n", + " Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: pytest in /usr/local/lib/python3.10/dist-packages (from tempeh->aif360[LawSchoolGPA]) (7.4.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from tempeh->aif360[LawSchoolGPA]) (2.31.0)\n", + "Collecting shap (from tempeh->aif360[LawSchoolGPA])\n", + " Downloading shap-0.42.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (547 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.9/547.9 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=0.24.0->aif360[LawSchoolGPA]) (1.16.0)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from memory-profiler->tempeh->aif360[LawSchoolGPA]) (5.9.5)\n", + "Requirement already satisfied: iniconfig in /usr/local/lib/python3.10/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA]) (2.0.0)\n", + "Requirement already satisfied: pluggy<2.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA]) (1.3.0)\n", + "Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /usr/local/lib/python3.10/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA]) (1.1.3)\n", + "Requirement already satisfied: tomli>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytest->tempeh->aif360[LawSchoolGPA]) (2.0.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->tempeh->aif360[LawSchoolGPA]) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->tempeh->aif360[LawSchoolGPA]) (3.4)\n", + "Requirement already 
satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->tempeh->aif360[LawSchoolGPA]) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->tempeh->aif360[LawSchoolGPA]) (2023.7.22)\n", + "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap->tempeh->aif360[LawSchoolGPA]) (4.66.1)\n", + "Collecting slicer==0.0.7 (from shap->tempeh->aif360[LawSchoolGPA])\n", + " Downloading slicer-0.0.7-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap->tempeh->aif360[LawSchoolGPA]) (0.56.4)\n", + "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap->tempeh->aif360[LawSchoolGPA]) (2.2.1)\n", + "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap->tempeh->aif360[LawSchoolGPA]) (0.39.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from numba->shap->tempeh->aif360[LawSchoolGPA]) (67.7.2)\n", + "Installing collected packages: slicer, memory-profiler, shap, tempeh\n", + "Successfully installed memory-profiler-0.61.0 shap-0.42.1 slicer-0.0.7 tempeh-0.1.12\n", + "Requirement already satisfied: aif360[Reductions] in /usr/local/lib/python3.10/dist-packages (0.5.0)\n", + "Requirement already satisfied: numpy>=1.16 in /usr/local/lib/python3.10/dist-packages (from aif360[Reductions]) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from aif360[Reductions]) (1.11.2)\n", + "Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.10/dist-packages (from aif360[Reductions]) (1.5.3)\n", + "Requirement already satisfied: scikit-learn>=1.0 in /usr/local/lib/python3.10/dist-packages (from aif360[Reductions]) (1.2.2)\n", + "Requirement already satisfied: 
matplotlib in /usr/local/lib/python3.10/dist-packages (from aif360[Reductions]) (3.7.1)\n", + "Collecting fairlearn~=0.7 (from aif360[Reductions])\n", + " Downloading fairlearn-0.9.0-py3-none-any.whl (231 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m231.5/231.5 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360[Reductions]) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24.0->aif360[Reductions]) (2023.3.post1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360[Reductions]) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.0->aif360[Reductions]) (3.2.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (1.4.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->aif360[Reductions]) 
(3.1.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=0.24.0->aif360[Reductions]) (1.16.0)\n", + "Installing collected packages: fairlearn\n", + "Successfully installed fairlearn-0.9.0\n" + ] + } ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.0 0.0\n" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "c2aYylUxzddc" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\", category=FutureWarning)" + ] }, { - "data": { - "text/markdown": [ - "#### Protected attribute names" + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "fUg_BAAJzddf", + "outputId": "5f2c3254-f1b3-461a-ba01-43526989e336", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:root:\n", + "`load_boston` has been removed from scikit-learn since version 1.2.\n", + "\n", + "The Boston housing prices dataset has an ethical problem: as\n", + "investigated in [1], the authors of this dataset engineered a\n", + "non-invertible variable \"B\" assuming that racial self-segregation had a\n", + "positive impact on house prices [2]. 
Furthermore the goal of the\n", + "research that led to the creation of this dataset was to study the\n", + "impact of air quality but it did not give adequate demonstration of the\n", + "validity of this assumption.\n", + "\n", + "The scikit-learn maintainers therefore strongly discourage the use of\n", + "this dataset unless the purpose of the code is to study and educate\n", + "about ethical issues in data science and machine learning.\n", + "\n", + "In this special case, you can fetch the dataset from the original\n", + "source::\n", + "\n", + " import pandas as pd\n", + " import numpy as np\n", + "\n", + " data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n", + " raw_df = pd.read_csv(data_url, sep=\"\\s+\", skiprows=22, header=None)\n", + " data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n", + " target = raw_df.values[1::2, 2]\n", + "\n", + "Alternative datasets include the California housing dataset and the\n", + "Ames housing dataset. You can load the datasets as follows::\n", + "\n", + " from sklearn.datasets import fetch_california_housing\n", + " housing = fetch_california_housing()\n", + "\n", + "for the California housing dataset and::\n", + "\n", + " from sklearn.datasets import fetch_openml\n", + " housing = fetch_openml(name=\"house_prices\", as_frame=True)\n", + "\n", + "for the Ames housing dataset.\n", + "\n", + "[1] M Carlisle.\n", + "\"Racist data destruction?\"\n", + "\n", + "\n", + "[2] Harrison Jr, David, and Daniel L. Rubinfeld.\n", + "\"Hedonic housing prices and the demand for clean air.\"\n", + "Journal of environmental economics and management 5.1 (1978): 81-102.\n", + "\n", + ": LawSchoolGPADataset will be unavailable. 
To install, run:\n", + "pip install 'aif360[LawSchoolGPA]'\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "# Load all necessary packages\n", + "from aif360.metrics import BinaryLabelDatasetMetric\n", + "from aif360.metrics import ClassificationMetric\n", + "\n", + "from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult\n", + "\n", + "from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.preprocessing import MaxAbsScaler\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from IPython.display import Markdown, display\n", + "import urllib.request\n", + "import numpy as np" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "['sex', 'race']\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "-NMOHx5Ozddi" + }, + "source": [ + "#### Load dataset and set options" + ] }, { - "data": { - "text/markdown": [ - "#### Privileged and unprivileged protected attribute values" + "cell_type": "code", + "source": [ + "import os\n", + "import aif360\n", + "\n", + "# Obtain the location where it is installed\n", + "LIB_PATH = aif360.__file__.rsplit(\"aif360\", 1)[0]\n", + "\n", + "# Make sure the raw data files are present; fetch them if any is missing\n", + "def check_data_or_download(destn, files, data_source_directory):\n", + "    \"\"\"Ensure every name in `files` exists in `destn`, downloading all of them otherwise.\n", + "\n", + "    destn: local directory expected to hold the files (created if absent).\n", + "    files: file names looked up in `destn` and appended to the URL prefix.\n", + "    data_source_directory: URL prefix each file name is concatenated onto.\n", + "    \"\"\"\n", + "    # Create the directory first so the listing below cannot raise when it is absent\n", + "    os.makedirs(destn, exist_ok=True)\n", + "    # List the directory once rather than once per expected file\n", + "    present = set(os.listdir(destn))\n", + "    check = all(item in present for item in files)\n", + "    if check:\n", + "        print(\"Adult dataset is available for us\")\n", + "    else:\n", + "        print(\"Some files are missing. 
Downloading now.\")\n", + "        for data_file in files:\n", + "            _ = urllib.request.urlretrieve(data_source_directory + data_file,\n", + "                                           os.path.join(destn, data_file))\n", + "\n", + "# Download adult dataset\n", + "data_source_directory = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/\"\n", + "destn = os.path.join(LIB_PATH, \"aif360\", \"data\", \"raw\", \"adult\")\n", + "files = [\"adult.data\", \"adult.test\", \"adult.names\"]\n", + "\n", + "check_data_or_download(destn, files, data_source_directory)" ], - "text/plain": [ - "" + "metadata": { + "id": "H3kJd9cr00if", + "outputId": "f0a02043-4906-4755-d748-3be33159311f", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Some files are missing. Downloading now.\n" + ] + } ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[array([1.]), array([1.])] [array([0.]), array([0.])]\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "1WWKW9nEzddk" + }, + "outputs": [], + "source": [ + "# Get the dataset and split into train and test\n", + "dataset_orig = load_preproc_data_adult()\n", + "\n", + "privileged_groups = [{'sex': 1}]\n", + "unprivileged_groups = [{'sex': 0}]\n", + "\n", + "np.random.seed(0)\n", + "dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)" + ] }, { - "data": { - "text/markdown": [ - "#### Dataset feature names" + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "8uJ1DS2Jzddq", + "outputId": "8eac70c1-676b-4e3c-996c-50e8202d948e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 324 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Training Dataset shape" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + 
"text": [ + "(34189, 18)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Favorable and unfavorable labels" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.0 0.0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Protected attribute names" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['sex', 'race']\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Privileged and unprivileged protected attribute values" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[array([1.]), array([1.])] [array([0.]), array([0.])]\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Dataset feature names" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['race', 'sex', 'Age (decade)=10', 'Age (decade)=20', 'Age (decade)=30', 'Age (decade)=40', 'Age (decade)=50', 'Age (decade)=60', 'Age (decade)=>=70', 'Education Years=6', 'Education Years=7', 'Education Years=8', 'Education Years=9', 'Education Years=10', 'Education Years=11', 'Education Years=12', 'Education Years=<6', 'Education Years=>12']\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "# print out some labels, names, etc.\n", + "display(Markdown(\"#### Training Dataset shape\"))\n", + "print(dataset_orig_train.features.shape)\n", + "display(Markdown(\"#### Favorable and unfavorable labels\"))\n", + "print(dataset_orig_train.favorable_label, dataset_orig_train.unfavorable_label)\n", + "display(Markdown(\"#### Protected attribute names\"))\n", + "print(dataset_orig_train.protected_attribute_names)\n", + "display(Markdown(\"#### Privileged and unprivileged protected attribute values\"))\n", + 
"print(dataset_orig_train.privileged_protected_attributes,\n", + " dataset_orig_train.unprivileged_protected_attributes)\n", + "display(Markdown(\"#### Dataset feature names\"))\n", + "print(dataset_orig_train.feature_names)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "['race', 'sex', 'Age (decade)=10', 'Age (decade)=20', 'Age (decade)=30', 'Age (decade)=40', 'Age (decade)=50', 'Age (decade)=60', 'Age (decade)=>=70', 'Education Years=6', 'Education Years=7', 'Education Years=8', 'Education Years=9', 'Education Years=10', 'Education Years=11', 'Education Years=12', 'Education Years=<6', 'Education Years=>12']\n" - ] - } - ], - "source": [ - "# print out some labels, names, etc.\n", - "display(Markdown(\"#### Training Dataset shape\"))\n", - "print(dataset_orig_train.features.shape)\n", - "display(Markdown(\"#### Favorable and unfavorable labels\"))\n", - "print(dataset_orig_train.favorable_label, dataset_orig_train.unfavorable_label)\n", - "display(Markdown(\"#### Protected attribute names\"))\n", - "print(dataset_orig_train.protected_attribute_names)\n", - "display(Markdown(\"#### Privileged and unprivileged protected attribute values\"))\n", - "print(dataset_orig_train.privileged_protected_attributes, \n", - " dataset_orig_train.unprivileged_protected_attributes)\n", - "display(Markdown(\"#### Dataset feature names\"))\n", - "print(dataset_orig_train.feature_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Metric for original training data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "PD-Va4lxzddu" + }, + "source": [ + "#### Metric for original training data" + ] + }, { - "data": { - "text/markdown": [ - "#### Original training dataset" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "2sTYG2E3zddv", + "outputId": 
"7ae97420-5a0d-4dab-eed9-f90ef6ca4d53", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Original training dataset" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", + "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "# Metric for the original dataset\n", + "metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "display(Markdown(\"#### Original training dataset\"))\n", + "print(\"Train set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_train.mean_difference())\n", + "metric_orig_test = BinaryLabelDatasetMetric(dataset_orig_test,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_test.mean_difference())" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", - "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" - ] - } - ], - "source": [ - "# Metric for the original dataset\n", - "metric_orig_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", - " unprivileged_groups=unprivileged_groups,\n", - " privileged_groups=privileged_groups)\n", - "display(Markdown(\"#### Original training dataset\"))\n", - "print(\"Train set: Difference in mean outcomes between 
unprivileged and privileged groups = %f\" % metric_orig_train.mean_difference())\n", - "metric_orig_test = BinaryLabelDatasetMetric(dataset_orig_test, \n", - " unprivileged_groups=unprivileged_groups,\n", - " privileged_groups=privileged_groups)\n", - "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_orig_test.mean_difference())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "#### Scaled dataset - Verify that the scaling does not affect the group label statistics" + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "ooGixHrjzddw", + "outputId": "f5ef12fc-3d79-4fd6-f5ec-22752d9faf15", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Scaled dataset - Verify that the scaling does not affect the group label statistics" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", + "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "min_max_scaler = MaxAbsScaler()\n", + "dataset_orig_train.features = min_max_scaler.fit_transform(dataset_orig_train.features)\n", + "dataset_orig_test.features = min_max_scaler.transform(dataset_orig_test.features)\n", + "metric_scaled_train = BinaryLabelDatasetMetric(dataset_orig_train,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "display(Markdown(\"#### Scaled dataset - Verify that the scaling does not affect the group label statistics\"))\n", + "print(\"Train set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % 
metric_scaled_train.mean_difference())\n", + "metric_scaled_test = BinaryLabelDatasetMetric(dataset_orig_test,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_scaled_test.mean_difference())\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train set: Difference in mean outcomes between unprivileged and privileged groups = -0.193075\n", - "Test set: Difference in mean outcomes between unprivileged and privileged groups = -0.198048\n" - ] - } - ], - "source": [ - "min_max_scaler = MaxAbsScaler()\n", - "dataset_orig_train.features = min_max_scaler.fit_transform(dataset_orig_train.features)\n", - "dataset_orig_test.features = min_max_scaler.transform(dataset_orig_test.features)\n", - "metric_scaled_train = BinaryLabelDatasetMetric(dataset_orig_train, \n", - " unprivileged_groups=unprivileged_groups,\n", - " privileged_groups=privileged_groups)\n", - "display(Markdown(\"#### Scaled dataset - Verify that the scaling does not affect the group label statistics\"))\n", - "print(\"Train set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_scaled_train.mean_difference())\n", - "metric_scaled_test = BinaryLabelDatasetMetric(dataset_orig_test, \n", - " unprivileged_groups=unprivileged_groups,\n", - " privileged_groups=privileged_groups)\n", - "print(\"Test set: Difference in mean outcomes between unprivileged and privileged groups = %f\" % metric_scaled_test.mean_difference())\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Standard Logistic Regression" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "kFOQg3MVzddx" + }, + "source": [ + "### Standard Logistic Regression" + ] + }, { 
- "data": { - "text/markdown": [ - "#### Accuracy" + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "OQho44_nzddx", + "outputId": "3759b9a6-88e9-45a3-9edd-a0ce4e7ac66a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 74 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Accuracy" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.8042039172865625\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "X_train = dataset_orig_train.features\n", + "y_train = dataset_orig_train.labels.ravel()\n", + "\n", + "lmod = LogisticRegression(solver='lbfgs')\n", + "lmod.fit(X_train, y_train, sample_weight=dataset_orig_train.instance_weights)\n", + "\n", + "X_test = dataset_orig_test.features\n", + "y_test = dataset_orig_test.labels.ravel()\n", + "\n", + "y_pred = lmod.predict(X_test)\n", + "\n", + "display(Markdown(\"#### Accuracy\"))\n", + "lr_acc = accuracy_score(y_test, y_pred)\n", + "print(lr_acc)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.8042039172865625\n" - ] - } - ], - "source": [ - "X_train = dataset_orig_train.features\n", - "y_train = dataset_orig_train.labels.ravel()\n", - "\n", - "lmod = LogisticRegression(solver='lbfgs')\n", - "lmod.fit(X_train, y_train, sample_weight=dataset_orig_train.instance_weights)\n", - "\n", - "X_test = dataset_orig_test.features\n", - "y_test = dataset_orig_test.labels.ravel()\n", - "\n", - "y_pred = lmod.predict(X_test)\n", - "\n", - "display(Markdown(\"#### Accuracy\"))\n", - "lr_acc = accuracy_score(y_test, y_pred)\n", - "print(lr_acc)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "#### Average odds difference" + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "OhXpwKpfzddz", + "outputId": 
"b5fd4ff6-c334-49d1-c196-f2355377f56d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 74 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Average odds difference" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "-0.27273605621431707\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)\n", + "dataset_orig_test_pred.labels = y_pred\n", + "\n", + "# positive class index\n", + "pos_ind = np.where(lmod.classes_ == dataset_orig_train.favorable_label)[0][0]\n", + "dataset_orig_test_pred.scores = lmod.predict_proba(X_test)[:,pos_ind].reshape(-1,1)\n", + "\n", + "metric_test = ClassificationMetric(dataset_orig_test,\n", + " dataset_orig_test_pred,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "display(Markdown(\"#### Average odds difference\"))\n", + "lr_aod = metric_test.average_odds_difference()\n", + "print(lr_aod)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "-0.27273605621431707\n" - ] - } - ], - "source": [ - "dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)\n", - "dataset_orig_test_pred.labels = y_pred\n", - "\n", - "# positive class index\n", - "pos_ind = np.where(lmod.classes_ == dataset_orig_train.favorable_label)[0][0]\n", - "dataset_orig_test_pred.scores = lmod.predict_proba(X_test)[:,pos_ind].reshape(-1,1)\n", - "\n", - "metric_test = ClassificationMetric(dataset_orig_test, \n", - " dataset_orig_test_pred,\n", - " unprivileged_groups=unprivileged_groups,\n", - " privileged_groups=privileged_groups)\n", - "display(Markdown(\"#### Average odds difference\"))\n", - "lr_aod = metric_test.average_odds_difference()\n", - "print(lr_aod)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 
Exponentiated Gradient Reduction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Choose a base model for the randomized classifer" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "estimator = LogisticRegression(solver='lbfgs', max_iter=1000)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train the randomized classifier and observe test accuracy. Other options for `constraints` include \"DemographicParity\", \"TruePositiveRateParity\", \"FalsePositiveRateParity\", and \"ErrorRateRatio\"." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "-hd8dcTbzdd0" + }, + "source": [ + "### Exponentiated Gradient Reduction" + ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" - ] - } - ], - "source": [ - "np.random.seed(0) #need for reproducibility\n", - "exp_grad_red = ExponentiatedGradientReduction(estimator=estimator, \n", - " constraints=\"EqualizedOdds\",\n", - " drop_prot_attr=False)\n", - "exp_grad_red.fit(dataset_orig_train)\n", - "exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "8HFqeEAhzdd1" + }, + "source": [ + "Choose a base model for the randomized classifier" + ] + }, { - "data": { - "text/markdown": [ - "#### Accuracy" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "g74S8KKczdd4" + }, + "outputs": [], + "source": [ + "estimator = LogisticRegression(solver='lbfgs', max_iter=1000)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - 
"text": [ - "0.7865283559680611\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "PA838L0rzdd6" + }, + "source": [ + "Train the randomized classifier and observe test accuracy. Other options for `constraints` include \"DemographicParity\", \"TruePositiveRateParity\", \"FalsePositiveRateParity\", and \"ErrorRateRatio\"." + ] }, { - "data": { - "text/markdown": [ - "#### Average odds difference" + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "YNvcvDtmzdd7", + "outputId": "32272160-d148-4d6b-e375-dbb630fd4922", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_label.py:116: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + } ], - "text/plain": [ - "" + "source": [ + "np.random.seed(0) #need for reproducibility\n", + "exp_grad_red = ExponentiatedGradientReduction(estimator=estimator,\n", + " constraints=\"EqualizedOdds\",\n", + " drop_prot_attr=False)\n", + "exp_grad_red.fit(dataset_orig_train)\n", + "exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.011012958938905922\n" - ] + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "GPKzFOvVzdd8", + "outputId": "1de0b993-c63a-47b3-e83e-63c8d81131dc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 132 + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "#### Accuracy" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.7865283559680611\n" + ] + }, + { + "output_type": "display_data", + "data": { + 
"text/plain": [ + "" + ], + "text/markdown": "#### Average odds difference" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.011012958938905922\n" + ] + } + ], + "source": [ + "metric_test = ClassificationMetric(dataset_orig_test,\n", + " exp_grad_red_pred,\n", + " unprivileged_groups=unprivileged_groups,\n", + " privileged_groups=privileged_groups)\n", + "\n", + "display(Markdown(\"#### Accuracy\"))\n", + "egr_acc = metric_test.accuracy()\n", + "print(egr_acc)\n", + "\n", + "#Check if accuracy is comparable\n", + "assert abs(lr_acc-egr_acc)<0.03\n", + "\n", + "display(Markdown(\"#### Average odds difference\"))\n", + "egr_aod = metric_test.average_odds_difference()\n", + "print(egr_aod)\n", + "\n", + "#Check if average odds difference has improved\n", + "assert abs(egr_aod)