From 585290fc829db32645c1231d5b0385b9e90a0a4c Mon Sep 17 00:00:00 2001 From: Felipe Alex Hofmann Date: Wed, 19 Jul 2023 19:29:25 -0700 Subject: [PATCH] Use property classes in multi table `QualityReport` (#383) * . * Finish general implementation * Final version * Add * Add missing tests * . * Update get_details logic * Address feedback * Fix lint * Minor feedback * More accurate progress bar for multi table Quality Report (#385) * Update bar * Test cases * Add unit tests for generate * Update test --- .../reports/multi_table/quality_report.py | 359 ++-- .../_properties/column_pair_trends.py | 20 +- .../reports/single_table/quality_report.py | 2 +- sdmetrics/warnings.py | 2 - .../test_multi_table_quality_report.py | 78 +- .../test_multi_table_quality_report.py | 1445 ++++------------- 6 files changed, 531 insertions(+), 1375 deletions(-) diff --git a/sdmetrics/reports/multi_table/quality_report.py b/sdmetrics/reports/multi_table/quality_report.py index 88caa2b4..854d7f3c 100644 --- a/sdmetrics/reports/multi_table/quality_report.py +++ b/sdmetrics/reports/multi_table/quality_report.py @@ -1,6 +1,5 @@ """Multi table quality report.""" -import itertools import pickle import sys import warnings @@ -10,34 +9,24 @@ import pkg_resources import tqdm -from sdmetrics.errors import IncomputableMetricError -from sdmetrics.multi_table import ( - CardinalityShapeSimilarity, ContingencySimilarity, CorrelationSimilarity, KSComplement, - TVComplement) -from sdmetrics.reports.multi_table.plot_utils import get_table_relationships_plot -from sdmetrics.reports.single_table.plot_utils import get_column_pairs_plot, get_column_shapes_plot -from sdmetrics.reports.utils import ( - aggregate_metric_results, discretize_and_apply_metric, validate_multi_table_inputs) +from sdmetrics.reports.multi_table._properties import Cardinality, ColumnPairTrends, ColumnShapes +from sdmetrics.reports.utils import validate_multi_table_inputs class QualityReport(): """Multi table quality report. This class creates a quality report for multi-table data. It calculates the quality - score along three properties - Column Shapes, Column Pair Trends, and Parent Child - Relationships. + score along three properties - Column Shapes, Column Pair Trends, and Cardinality. 
""" - METRICS = { - 'Column Shapes': [KSComplement, TVComplement], - 'Column Pair Trends': [CorrelationSimilarity, ContingencySimilarity], - 'Cardinality': [CardinalityShapeSimilarity], - } - def __init__(self): + self._tables = [] self._overall_quality_score = None - self._metric_results = {} - self._property_breakdown = {} + self._properties_instances = {} + self._properties_scores = {} + self._is_generated = False + self._package_version = None self._property_errors = {} def _print_results(self, out=sys.stdout): @@ -48,16 +37,15 @@ def _print_results(self, out=sys.stdout): out.write( f'\nOverall Quality Score: {round(self._overall_quality_score * 100, 2)}%\n\n') - if len(self._property_breakdown) > 0: - out.write('Properties:\n') + out.write('Properties:\n') - for prop, score in self._property_breakdown.items(): + for property_name, score in self._properties_scores.items(): if not pd.isna(score): - out.write(f'{prop}: {round(score * 100, 2)}%\n') - elif self._property_errors[prop] > 0: - out.write(f'{prop}: Error computing property.\n') + out.write(f'{property_name}: {round(score * 100, 2)}%\n') + elif property_name in self._property_errors: + out.write(f'{property_name}: Error computing property.\n') else: - out.write(f'{prop}: NaN\n') + out.write(f'{property_name}: NaN\n') def generate(self, real_data, synthetic_data, metadata, verbose=True): """Generate report. @@ -70,69 +58,61 @@ def generate(self, real_data, synthetic_data, metadata, verbose=True): metadata (dict): The metadata, which contains each column's data type as well as relationships. verbose (bool): - Whether or not to print report summary and progress. + Whether or not to print the report summary and progress. """ validate_multi_table_inputs(real_data, synthetic_data, metadata) - metrics = list(itertools.chain.from_iterable(self.METRICS.values())) + self._tables = list(real_data.keys()) + + self._properties_instances = { + 'Column Shapes': ColumnShapes(), + 'Column Pair Trends': ColumnPairTrends(), + 'Cardinality': Cardinality() + } + + if verbose: + sys.stdout.write('Generating report ...\n') + + num_columns = [len(table['columns']) for table in metadata['tables'].values()] + num_properties = len(self._properties_instances) + progress_bar = None + for index, property_tuple in enumerate(self._properties_instances.items()): + property_name, property_instance = property_tuple + if verbose: + if property_name == 'Column Shapes': + num_iterations = sum(num_columns) + elif property_name == 'Column Pair Trends': + # for each table, the number of combinations of pairs of columns is + # n * (n - 1) / 2, where n is the number of columns in the table + num_iterations = sum([(n_cols * (n_cols - 1)) // 2 for n_cols in num_columns]) + elif property_name == 'Cardinality': + num_iterations = len(metadata['relationships']) + + progress_bar = tqdm.tqdm(total=num_iterations, file=sys.stdout) + progress_bar.set_description( + f'({index + 1}/{num_properties}) Evaluating {property_name}: ') - for metric in tqdm.tqdm(metrics, desc='Creating report', disable=(not verbose)): try: - self._metric_results[metric.__name__] = metric.compute_breakdown( - real_data, synthetic_data, metadata) - except IncomputableMetricError: - # Metric is not compatible with this dataset. 
- self._metric_results[metric.__name__] = {} - continue - - for table_name, table_data in real_data.items(): - existing_column_pairs = [] - if table_name in self._metric_results['ContingencySimilarity']: - existing_column_pairs.append( - list(self._metric_results['ContingencySimilarity'][table_name].keys())) - else: - self._metric_results['ContingencySimilarity'][table_name] = {} - - if table_name in self._metric_results['CorrelationSimilarity']: - existing_column_pairs.append( - list(self._metric_results['CorrelationSimilarity'][table_name].keys())) - - additional_results = discretize_and_apply_metric( - real_data[table_name], - synthetic_data[table_name], - metadata['tables'][table_name], - ContingencySimilarity, - existing_column_pairs, - ) - self._metric_results['ContingencySimilarity'][table_name].update(additional_results) - - self._property_breakdown = {} - for prop, metrics in self.METRICS.items(): - prop_scores = [] - num_prop_errors = 0 - if prop == 'Cardinality': - for metric in metrics: - if 'score' in self._metric_results[metric.__name__]: - prop_scores.append(self._metric_results[metric.__name__]['score']) - else: - _, num_metric_errors = aggregate_metric_results( - self._metric_results[metric.__name__]) - num_prop_errors += num_metric_errors - else: - for metric in metrics: - for _, table_breakdowns in self._metric_results[metric.__name__].items(): - _, num_metric_errors = aggregate_metric_results(table_breakdowns) - num_prop_errors += num_metric_errors + self._properties_scores[property_name] = property_instance.get_score( + real_data, synthetic_data, metadata, progress_bar) + except BaseException: + self._properties_scores[property_name] = np.nan + self._property_errors[property_name] = True - self._property_breakdown[prop] = np.nanmean(prop_scores) if ( - len(prop_scores) > 0 - ) else self.get_details(prop)['Quality Score'].mean() - self._property_errors[prop] = num_prop_errors + if verbose: + progress_bar.close() - self._overall_quality_score = np.nanmean(list(self._property_breakdown.values())) + scores = list(self._properties_scores.values()) + self._overall_quality_score = np.nanmean(scores) + self._is_generated = True if verbose: - self._print_results() + self._print_results(sys.stdout) + + def _validate_generated(self): + if not self._is_generated: + raise ValueError( + "The report has not been generated yet. Please call the 'generate' method.") def get_score(self): """Return the overall quality score. @@ -141,20 +121,43 @@ def get_score(self): float The overall quality score. """ + self._validate_generated() + return self._overall_quality_score def get_properties(self): - """Return the property score breakdown. + """Return the score for each property. Returns: pandas.DataFrame - The property score breakdown. + The score for each property. """ + self._validate_generated() + return pd.DataFrame({ - 'Property': self._property_breakdown.keys(), - 'Score': self._property_breakdown.values(), + 'Property': self._properties_scores.keys(), + 'Score': self._properties_scores.values(), }) + def _validate_inputs(self, property_name, table_name): + self._validate_generated() + + valid_properties = list(self._properties_instances.keys()) + if property_name not in valid_properties: + raise ValueError( + f"Invalid property name ('{property_name}'). " + f'It must be one of {valid_properties}.' + ) + + if (table_name is not None) and (table_name not in self._tables): + raise ValueError(f"Unknown table ('{table_name}'). 
Must be one of {self._tables}.") + + def _validate_visualization(self, property_name, table_name): + self._validate_inputs(property_name, table_name) + if property_name in ['Column Shapes', 'Column Pair Trends'] and table_name is None: + raise ValueError('Table name must be provided when viewing details for ' + f"property '{property_name}'.") + def get_visualization(self, property_name, table_name=None): """Return a visualization for each score for the given property and table. @@ -163,55 +166,15 @@ def get_visualization(self, property_name, table_name=None): The name of the property to return score details for. table_name (str): The table to show scores for. Must be provided for 'Column Shapes' - and 'Column Pair Trends' + and 'Column Pair Trends'. Returns: plotly.graph_objects._figure.Figure A visualization of the requested property's scores. """ - if property_name in ['Column Shapes', 'Column Pair Trends'] and table_name is None: - raise ValueError('Table name must be provided when viewing details for ' - f'property {property_name}.') - - if property_name == 'Column Shapes': - score_breakdowns = { - metric.__name__: self._metric_results[metric.__name__].get(table_name, {}) - for metric in self.METRICS.get(property_name, []) - } - fig = get_column_shapes_plot(score_breakdowns) - - elif property_name == 'Column Pair Trends': - score_breakdowns = { - metric.__name__: self._metric_results[metric.__name__].get(table_name, {}) - for metric in self.METRICS.get(property_name, []) - } - fig = get_column_pairs_plot( - score_breakdowns, - ) - - elif property_name == 'Cardinality' or 'Parent Child Relationships': - if property_name == 'Parent Child Relationships': - property_name = 'Cardinality' - msg = ( - "The 'Parent Child Relationships' property name is no longer recognized. " - "Please update to 'Cardinality' instead." - ) - warnings.warn(msg, FutureWarning) - - score_breakdowns = { - metric.__name__: self._metric_results[metric.__name__] - for metric in self.METRICS.get(property_name, []) - } - if table_name is not None: - for metric, metric_results in score_breakdowns.items(): - score_breakdowns[metric] = { - tables: results for tables, results in metric_results.items() - if table_name in tables - } - - fig = get_table_relationships_plot(score_breakdowns) + self._validate_visualization(property_name, table_name) - return fig + return self._properties_instances[property_name].get_visualization(table_name) def get_details(self, property_name, table_name=None): """Return the details for each score for the given property name. @@ -223,128 +186,34 @@ def get_details(self, property_name, table_name=None): Optionally filter results by table. Returns: - pandas.DataFrame - The score breakdown. + dict: + The details of the scores of a property. 
""" - tables = [] - columns = [] - metrics = [] - scores = [] - errors = [] - details = pd.DataFrame() - - if property_name == 'Column Shapes': - for metric in self.METRICS[property_name]: - for table, table_breakdown in self._metric_results[metric.__name__].items(): - if table_name is not None and table != table_name: - continue - - for column, score_breakdown in table_breakdown.items(): - if 'score' in score_breakdown and pd.isna(score_breakdown['score']): - continue - tables.append(table) - columns.append(column) - metrics.append(metric.__name__) - scores.append(score_breakdown.get('score', np.nan)) - errors.append(score_breakdown.get('error', np.nan)) - - details = pd.DataFrame({ - 'Table': tables, - 'Column': columns, - 'Metric': metrics, - 'Quality Score': scores, - }).sort_values(by=['Table'], ignore_index=True) - - elif property_name == 'Column Pair Trends': - real_scores = [] - synthetic_scores = [] - for metric in self.METRICS[property_name]: - for table, table_breakdown in self._metric_results[metric.__name__].items(): - if table_name is not None and table != table_name: - continue - - for column_pair, score_breakdown in table_breakdown.items(): - tables.append(table) - columns.append(column_pair) - metrics.append(metric.__name__) - scores.append(score_breakdown.get('score', np.nan)) - real_scores.append(score_breakdown.get('real', np.nan)) - synthetic_scores.append(score_breakdown.get('synthetic', np.nan)) - errors.append(score_breakdown.get('error', np.nan)) - - details = pd.DataFrame({ - 'Table': tables, - 'Column 1': [col1 for col1, _ in columns], - 'Column 2': [col2 for _, col2 in columns], - 'Metric': metrics, - 'Quality Score': scores, - 'Real Correlation': real_scores, - 'Synthetic Correlation': synthetic_scores, - }).sort_values(by=['Table'], ignore_index=True) - - elif property_name == 'Cardinality' or 'Parent Child Relationships': - if property_name == 'Parent Child Relationships': - property_name = 'Cardinality' - msg = ( - "The 'Parent Child Relationships' property name is no longer recognized. " - "Please update to 'Cardinality' instead.") - warnings.warn(msg, FutureWarning) - - child_tables = [] - for metric in self.METRICS[property_name]: - for table_pair, score_breakdown in self._metric_results[metric.__name__].items(): - if table_name is not None and table_name not in table_pair: - continue - - tables.append(table_pair[0]) - child_tables.append(table_pair[1]) - metrics.append(metric.__name__) - scores.append(score_breakdown.get('score', np.nan)) - errors.append(score_breakdown.get('error', np.nan)) - - details = pd.DataFrame({ - 'Child Table': child_tables, - 'Parent Table': tables, - 'Metric': metrics, - 'Quality Score': scores, - }) - - if pd.Series(errors).notna().sum() > 0: - details['Error'] = errors + self._validate_inputs(property_name, table_name) + + property_instance = self._properties_instances[property_name] + if property_name != 'Cardinality': + if table_name: + return property_instance._properties[table_name]._details.copy() + + details = {} + for table_name, property_ in property_instance._properties.items(): + details[table_name] = property_._details + + return details + + # For Cardinality, the details are a dictionary where the keys are tuples (table1, table2). + # If table_name is passed, select only the tuples which contain it. 
+ details = property_instance._details + if table_name: + return { + table_names: detail + for table_names, detail in details.items() + if table_name in table_names + } return details - def get_raw_result(self, metric_name): - """Return the raw result of the given metric name. - - Args: - metric_name (str): - The name of the desired metric. - - Returns: - dict - The raw results - """ - metrics = list(itertools.chain.from_iterable(self.METRICS.values())) - for metric in metrics: - if metric.__name__ == metric_name: - filtered_results = {} - for table_name, table_results in self._metric_results[metric_name].items(): - filtered_results[table_name] = { - key: result for key, result in table_results.items() - if not pd.isna(result['score']) - } - - return [ - { - 'metric': { - 'method': f'{metric.__module__}.{metric.__name__}', - 'parameters': {}, - }, - 'results': filtered_results, - }, - ] - def save(self, filepath): """Save this report instance to the given path using pickle. diff --git a/sdmetrics/reports/single_table/_properties/column_pair_trends.py b/sdmetrics/reports/single_table/_properties/column_pair_trends.py index bc9a29fc..9131e923 100644 --- a/sdmetrics/reports/single_table/_properties/column_pair_trends.py +++ b/sdmetrics/reports/single_table/_properties/column_pair_trends.py @@ -341,13 +341,13 @@ def _get_correlation_matrix(self, column_name): def _get_heatmap(self, correlation_matrix, coloraxis, hovertemplate, customdata=None): """Get the heatmap for the given correlation matrix.""" fig = go.Heatmap( - x=correlation_matrix.columns, - y=correlation_matrix.columns, - z=correlation_matrix, - coloraxis=coloraxis, - customdata=customdata, - hovertemplate=hovertemplate, - ) + x=correlation_matrix.columns, + y=correlation_matrix.columns, + z=correlation_matrix, + coloraxis=coloraxis, + customdata=customdata, + hovertemplate=hovertemplate, + ) return fig @@ -395,9 +395,9 @@ def get_visualization(self): synthetic_correlation = self._get_correlation_matrix('Synthetic Correlation') titles = [ - 'Real vs. Synthetic Similarity', - 'Numerical Correlation (Real Data)', - 'Numerical Correlation (Synthetic Data)', + 'Real vs. Synthetic Similarity', + 'Numerical Correlation (Real Data)', + 'Numerical Correlation (Synthetic Data)', ] specs = [[{'colspan': 2, 'l': 0.26, 'r': 0.26}, None], [{}, {}]] tmpl_1 = 'Column Pair
<br>(%{x},%{y})<br><br>
Similarity: %{z}' diff --git a/sdmetrics/reports/single_table/quality_report.py b/sdmetrics/reports/single_table/quality_report.py index f747dc71..4d67fc1d 100644 --- a/sdmetrics/reports/single_table/quality_report.py +++ b/sdmetrics/reports/single_table/quality_report.py @@ -76,7 +76,7 @@ def validate(self, real_data, synthetic_data, metadata): def _print_results(self, out=sys.stdout): """Print the quality report results.""" out.write( - f'\nOverall Quality Score: {round(self._overall_quality_score * 100, 2)}%\n\n' + f'\nOverall Quality Score: {round(self._overall_quality_score * 100, 2)}%\n\n' ) out.write('Properties:\n') diff --git a/sdmetrics/warnings.py b/sdmetrics/warnings.py index 641c23cb..1d489cd5 100644 --- a/sdmetrics/warnings.py +++ b/sdmetrics/warnings.py @@ -4,8 +4,6 @@ class SDMetricsWarning(RuntimeWarning): """Class to represent SDMetrics warnings.""" - pass - class ConstantInputWarning(SDMetricsWarning): """Thrown when the input data has all the same values.""" diff --git a/tests/integration/reports/multi_table/test_multi_table_quality_report.py b/tests/integration/reports/multi_table/test_multi_table_quality_report.py index c2509752..bd944df8 100644 --- a/tests/integration/reports/multi_table/test_multi_table_quality_report.py +++ b/tests/integration/reports/multi_table/test_multi_table_quality_report.py @@ -1,5 +1,6 @@ from datetime import date, datetime +import numpy as np import pandas as pd from sdmetrics.reports.multi_table.quality_report import QualityReport @@ -76,14 +77,83 @@ def load_test_data(): def test_multi_table_quality_report(): - """Test the multi table quality report.""" - real_data, synthetic_data, metadata = load_test_data() + """Test the multi table QualityReport. + Run all the public methods for QualityReport, and check that all the scores for + all the properties are correct. 
+ """ + # Setup + real_data, synthetic_data, metadata = load_test_data() report = QualityReport() - report.generate(real_data, synthetic_data, metadata) + # Run `generate`, `get_properties` and `get_score`, + # as well as `get_visualization` and `get_details` for every property: + # 'Column Shapes', 'Column Pair Trends', 'Cardinality' + report.generate(real_data, synthetic_data, metadata) properties = report.get_properties() + score = report.get_score() + visualization, details = [], [] + for property_ in report._properties_instances: + visualization.append(report.get_visualization(property_, 'table1')) + details.append(report.get_details(property_, 'table1')) + + # Run `get_details` for every property without passing a table_name + for property_ in report._properties_instances: + details.append(report.get_details(property_)) + + # Assert score + np.testing.assert_almost_equal(score, .72) pd.testing.assert_frame_equal(properties, pd.DataFrame({ 'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality'], - 'Score': [0.8, 0.6704734340781349, 0.75], + 'Score': [0.79, 0.62, 0.75], + })) + + # Assert Column Shapes details + pd.testing.assert_frame_equal(details[0], pd.DataFrame({ + 'Column': ['col2', 'col3'], + 'Metric': ['TVComplement', 'TVComplement'], + 'Score': [.75, .75] + })) + + # Assert Column Pair Trends details + pd.testing.assert_frame_equal(details[1], pd.DataFrame({ + 'Column 1': ['col2'], + 'Column 2': ['col3'], + 'Metric': ['ContingencySimilarity'], + 'Score': [.25], + 'Real Correlation': [np.nan], + 'Synthetic Correlation': [np.nan], + })) + + # Assert Cardinality details + assert details[2] == details[5] == {('table1', 'table2'): {'score': 0.75}} + + # Assert Column Shapes details without table_name + pd.testing.assert_frame_equal(details[3]['table1'], pd.DataFrame({ + 'Column': ['col2', 'col3'], + 'Metric': ['TVComplement', 'TVComplement'], + 'Score': [.75, .75] + })) + pd.testing.assert_frame_equal(details[3]['table2'], pd.DataFrame({ + 'Column': ['col4', 'col5', 'col7'], + 'Metric': ['KSComplement', 'KSComplement', 'KSComplement'], + 'Score': [.75, .75, 1] + })) + + # Assert Column Pair Trends details without table_name + pd.testing.assert_frame_equal(details[4]['table1'], pd.DataFrame({ + 'Column 1': ['col2'], + 'Column 2': ['col3'], + 'Metric': ['ContingencySimilarity'], + 'Score': [.25], + 'Real Correlation': [np.nan], + 'Synthetic Correlation': [np.nan], + })) + pd.testing.assert_frame_equal(details[4]['table2'], pd.DataFrame({ + 'Column 1': ['col4', 'col4', 'col5'], + 'Column 2': ['col5', 'col7', 'col7'], + 'Metric': ['CorrelationSimilarity', 'CorrelationSimilarity', 'CorrelationSimilarity'], + 'Score': [0.9901306731066666, 0.9853027960145061, 0.9678805694257717], + 'Real Correlation': [0.946664, 0.966247, 0.862622], + 'Synthetic Correlation': [0.926925, 0.936853, 0.798384], })) diff --git a/tests/unit/reports/multi_table/test_multi_table_quality_report.py b/tests/unit/reports/multi_table/test_multi_table_quality_report.py index 775a1dee..82a6f6b2 100644 --- a/tests/unit/reports/multi_table/test_multi_table_quality_report.py +++ b/tests/unit/reports/multi_table/test_multi_table_quality_report.py @@ -1,14 +1,15 @@ -import contextlib -import io import pickle +import re from unittest.mock import Mock, call, mock_open, patch import numpy as np import pandas as pd import pytest -from sdmetrics.errors import IncomputableMetricError from sdmetrics.reports.multi_table import QualityReport +from sdmetrics.reports.multi_table._properties.cardinality import Cardinality 
+from sdmetrics.reports.multi_table._properties.column_pair_trends import ColumnPairTrends +from sdmetrics.reports.multi_table._properties.column_shapes import ColumnShapes class TestQualityReport: @@ -22,610 +23,418 @@ def test___init__(self): report = QualityReport() # Assert + assert report._tables == [] assert report._overall_quality_score is None - assert report._metric_results == {} - assert report._property_breakdown == {} + assert report._properties_instances == {} + assert report._properties_scores == {} + assert report._is_generated is False + assert report._package_version is None - @patch('sdmetrics.reports.multi_table.quality_report.discretize_and_apply_metric') - def test_generate(self, mock_discretize_and_apply_metric): - """Test the ``generate`` method. - - Expect that the multi-table metrics are called. + def test__print_results(self): + """Expect that the correct messages are written.""" + # Setup + report = QualityReport() + report._overall_quality_score = 0.8 + report._properties_scores = { + 'Column Shapes': 0.6, + 'Column Pair Trends': 0.8, + 'Cardinality': 0.9 + } + report._property_errors = { + 'Column Shapes': False, + 'Column Pair Trends': False, + 'Cardinality': False, + } + mock_out = Mock() - Setup: - - Mock the expected multi-table metric compute breakdown calls. + # Run + report._print_results(mock_out) - Input: - - Real data. - - Synthetic data. - - Metadata. + # Assert + mock_out.write.assert_has_calls([ + call('\nOverall Quality Score: 80.0%\n\n'), + call('Properties:\n'), + call('Column Shapes: 60.0%\n'), + call('Column Pair Trends: 80.0%\n'), + call('Cardinality: 90.0%\n'), + ]) - Side Effects: - - Expect that each multi table metric's ``compute_breakdown`` methods are called once. - - Expect that the ``_overall_quality_score`` and ``_property_breakdown`` attributes - are populated. 
- """ + def test__print_results_with_error(self): + """Expect that the correct messages are written when a property errors out.""" # Setup - mock_discretize_and_apply_metric.return_value = {} - real_data = { - 'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [1, 1, 1]}), - } - synthetic_data = { - 'table1': pd.DataFrame({'col1': [1, 3, 3], 'col2': ['b', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [3, 1, 3]}), - } - metadata = { - 'tables': { - 'table1': {}, - 'table2': {}, - }, - } - ks_complement_mock = Mock() - ks_complement_mock.__name__ = 'KSComplement' - ks_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } + report = QualityReport() + report._overall_quality_score = 0.7 + report._properties_scores = { + 'Column Shapes': 0.6, + 'Column Pair Trends': np.nan, + 'Cardinality': 0.8, } - - tv_complement_mock = Mock() - tv_complement_mock.__name__ = 'TVComplement' - tv_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } + report._property_errors = { + 'Column Shapes': False, + 'Column Pair Trends': True, + 'Cardinality': False, } + mock_out = Mock() - corr_sim_mock = Mock() - corr_sim_mock.__name__ = 'CorrelationSimilarity' - corr_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Run + report._print_results(mock_out) - cont_sim_mock = Mock() - cont_sim_mock.__name__ = 'ContingencySimilarity' - cont_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Assert + mock_out.write.assert_has_calls([ + call('\nOverall Quality Score: 70.0%\n\n'), + call('Properties:\n'), + call('Column Shapes: 60.0%\n'), + call('Column Pair Trends: Error computing property.\n'), + call('Cardinality: 80.0%\n'), + ]) - cardinality_mock = Mock() - cardinality_mock.__name__ = 'CardinalityShapeSimilarity' - cardinality_mock.compute_breakdown.return_value = { - ('table1', 'table2'): {'score': 1.0}, + def test__print_results_with_all_errors(self): + """Expect that the correct messages are written when overall score is nan.""" + # Setup + report = QualityReport() + report._overall_quality_score = np.nan + report._properties_scores = { + 'Column Shapes': np.nan, + 'Column Pair Trends': np.nan, + 'Cardinality': np.nan } - metrics_mock = { - 'Column Shapes': [ks_complement_mock, tv_complement_mock], - 'Column Pair Trends': [corr_sim_mock, cont_sim_mock], - 'Cardinality': [cardinality_mock], + report._property_errors = { + 'Column Shapes': True, + 'Column Pair Trends': True, + 'Cardinality': True, } + mock_out = Mock() # Run - with patch.object( - QualityReport, - 'METRICS', - metrics_mock, - ): - report = QualityReport() - report.generate(real_data, synthetic_data, metadata) + report._print_results(mock_out) # Assert - ks_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - tv_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - corr_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cont_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cardinality_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - - assert report._overall_quality_score == 
0.43333333333333335 - assert report._property_breakdown == { - 'Column Shapes': 0.15000000000000002, - 'Column Pair Trends': 0.15000000000000002, - 'Cardinality': 1.0, - } + mock_out.write.assert_has_calls([ + call('\nOverall Quality Score: Error computing report.\n\n'), + call('Properties:\n'), + call('Column Shapes: Error computing property.\n'), + call('Column Pair Trends: Error computing property.\n'), + call('Cardinality: Error computing property.\n'), + ]) - @patch('sdmetrics.reports.multi_table.quality_report.discretize_and_apply_metric') - def test_generate_verbose_false(self, mock_discretize_and_apply_metric): - """Test the ``generate`` method with silent mode. Expect that nothing is printed.""" - # Setup - mock_discretize_and_apply_metric.return_value = {} + def get_data(self): real_data = { - 'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [1, 1, 1]}), + 'table1': pd.DataFrame({'id': [1, 2], 'col': [2, np.nan]}), + 'table2': pd.DataFrame({'id': [1, 2], 'col': ['a', np.nan]}) } - synthetic_data = { - 'table1': pd.DataFrame({'col1': [1, 3, 3], 'col2': ['b', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [3, 1, 3]}), + synth_data = { + 'table1': pd.DataFrame({'id': [1, 2], 'col': [3, np.nan]}), + 'table2': pd.DataFrame({'id': [1, 2], 'col': ['a', np.nan]}) } metadata = { 'tables': { - 'table1': {}, - 'table2': {}, + 'table1': {'columns': {'id': {'sdtype': 'id'}, 'col': {'sdtype': 'numerical'}}}, + 'table2': {'columns': {'id': {'sdtype': 'id'}, 'col': {'sdtype': 'categorical'}}} }, - } - - ks_complement_mock = Mock() - ks_complement_mock.__name__ = 'KSComplement' - ks_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + 'relationships': [ + { + 'parent_table_name': 'table1', + 'parent_primary_key': 'id', + 'child_table_name': 'table2', + 'child_foreign_key': 'id' + } + ] + } + + return real_data, synth_data, metadata + + @patch('sdmetrics.reports.multi_table.quality_report.sys.stdout') + @patch('sdmetrics.reports.multi_table.quality_report.tqdm.tqdm') + @patch( + 'sdmetrics.reports.multi_table._properties.column_pair_trends.ColumnPairTrends.get_score', + return_value=.2 + ) + @patch( + 'sdmetrics.reports.multi_table._properties.column_shapes.ColumnShapes.get_score', + return_value=.4 + ) + @patch( + 'sdmetrics.reports.multi_table._properties.cardinality.Cardinality.get_score', + return_value=.9 + ) + def test_generate(self, mock_cardinality_score, mock_column_shapes_score, + mock_column_pair_trends_score, mock_tqdm, mock_sys): + """Test the proper attributes are set and the progress bar runs correctly.""" + # Setup + real_data, synth_data, metadata = self.get_data() + report = QualityReport() + report._print_results = Mock() - tv_complement_mock = Mock() - tv_complement_mock.__name__ = 'TVComplement' - tv_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + # Run + report.generate(real_data, synth_data, metadata) - corr_sim_mock = Mock() - corr_sim_mock.__name__ = 'CorrelationSimilarity' - corr_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Assert + assert isinstance(report._properties_instances['Column Shapes'], ColumnShapes) + assert isinstance(report._properties_instances['Column Pair Trends'], ColumnPairTrends) + assert isinstance(report._properties_instances['Cardinality'], 
Cardinality) + + assert report._properties_scores['Column Shapes'] == .4 + assert report._properties_scores['Column Pair Trends'] == .2 + assert report._properties_scores['Cardinality'] == .9 + + assert report._overall_quality_score == .5 + assert report._is_generated is True + + mock_tqdm.assert_has_calls([ + call(total=4, file=mock_sys), + call().set_description('(1/3) Evaluating Column Shapes: '), + call().close(), + call(total=2, file=mock_sys), + call().set_description('(2/3) Evaluating Column Pair Trends: '), + call().close(), + call(total=1, file=mock_sys), + call().set_description('(3/3) Evaluating Cardinality: '), + call().close() + ]) - cont_sim_mock = Mock() - cont_sim_mock.__name__ = 'ContingencySimilarity' - cont_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + report._print_results.assert_called_once_with(mock_sys) - cardinality_mock = Mock() - cardinality_mock.__name__ = 'CardinalityShapeSimilarity' - cardinality_mock.compute_breakdown.return_value = { - ('table1', 'table2'): {'score': 1.0}, - } - metrics_mock = { - 'Column Shapes': [ks_complement_mock, tv_complement_mock], - 'Column Pair Trends': [corr_sim_mock, cont_sim_mock], - 'Cardinality': [cardinality_mock], - } + @patch( + 'sdmetrics.reports.multi_table._properties.column_pair_trends.ColumnPairTrends.get_score') + @patch('sdmetrics.reports.multi_table._properties.column_shapes.ColumnShapes.get_score') + @patch('sdmetrics.reports.multi_table._properties.cardinality.Cardinality.get_score') + def test_generate_failed_scores(self, mock_cardinality_score, mock_column_shapes_score, + mock_column_pair_trends_score): + """Test the ``generate`` method when `get_score` for each property fails.""" + # Setup + real_data, synth_data, metadata = self.get_data() + report = QualityReport() + mock_cardinality_score.side_effect = ValueError + mock_column_shapes_score.side_effect = ValueError + mock_column_pair_trends_score.side_effect = ValueError # Run - prints = io.StringIO() - with contextlib.redirect_stderr(prints), patch.object( - QualityReport, - 'METRICS', - metrics_mock, - ): - report = QualityReport() - report.generate(real_data, synthetic_data, metadata, verbose=False) + report.generate(real_data, synth_data, metadata) # Assert - ks_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - tv_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - corr_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cont_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cardinality_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - - assert report._overall_quality_score == 0.43333333333333335 - assert report._property_breakdown == { - 'Column Shapes': 0.15000000000000002, - 'Column Pair Trends': 0.15000000000000002, - 'Cardinality': 1.0, - } - assert prints.getvalue() == '' - - @patch('sdmetrics.reports.multi_table.quality_report.discretize_and_apply_metric') - def test_generate_with_errored_metric(self, mock_discretize_and_apply_metric): - """Test the ``generate`` method with an errored metric. - - Expect that the multi-table metrics are called. Expect that the aggregate scores - are computed without the errored metric. 
+ assert pd.isna(report._properties_scores['Column Shapes']) + assert pd.isna(report._properties_scores['Column Pair Trends']) + assert pd.isna(report._properties_scores['Cardinality']) - Setup: - - Mock the expected multi-table metric compute breakdown calls. + assert report._property_errors['Column Shapes'] is True + assert report._property_errors['Column Pair Trends'] is True + assert report._property_errors['Cardinality'] is True - Input: - - Real data. - - Synthetic data. - - Metadata. + assert pd.isna(report._overall_quality_score) - Side Effects: - - Expect that each multi table metric's ``compute_breakdown`` methods are called once. - - Expect that the ``_overall_quality_score`` and ``_property_breakdown`` attributes - are populated. - """ + def test_get_score(self): + """Test the ``get_score`` method.""" # Setup - mock_discretize_and_apply_metric.return_value = {} - real_data = { - 'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [1, 1, 1]}), - } - synthetic_data = { - 'table1': pd.DataFrame({'col1': [1, 3, 3], 'col2': ['b', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [3, 1, 3]}), - } - metadata = { - 'tables': { - 'table1': {}, - 'table2': {}, - }, - } + report = QualityReport() + mock_score = Mock() + report._overall_quality_score = mock_score + report._is_generated = True - ks_complement_mock = Mock() - ks_complement_mock.__name__ = 'KSComplement' - ks_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'error': 'test error'}, - } - } + # Run + score = report.get_score() - tv_complement_mock = Mock() - tv_complement_mock.__name__ = 'TVComplement' - tv_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + # Assert + assert score == mock_score - corr_sim_mock = Mock() - corr_sim_mock.__name__ = 'CorrelationSimilarity' - corr_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + def test_get_score_not_generated(self): + """Test the ``get_score`` method when the report hasn't been generated.""" + # Setup + report = QualityReport() - cont_sim_mock = Mock() - cont_sim_mock.__name__ = 'ContingencySimilarity' - cont_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Run and Assert + msg = "The report has not been generated yet. Please call the 'generate' method." 
+ with pytest.raises(ValueError, match=msg): + report.get_score() - cardinality_mock = Mock() - cardinality_mock.__name__ = 'CardinalityShapeSimilarity' - cardinality_mock.compute_breakdown.return_value = { - ('table1', 'table2'): {'score': 1.0}, - } - metrics_mock = { - 'Column Shapes': [ks_complement_mock, tv_complement_mock], - 'Column Pair Trends': [corr_sim_mock, cont_sim_mock], - 'Cardinality': [cardinality_mock], + def test_get_properties(self): + """Test the ``get_properties`` method.""" + # Setup + report = QualityReport() + report._properties_scores = { + 'Column Shapes': 0.1, + 'Column Pair Trends': 0.2, + 'Cardinality': 0.3, } + report._is_generated = True # Run - with patch.object( - QualityReport, - 'METRICS', - metrics_mock, - ): - report = QualityReport() - report.generate(real_data, synthetic_data, metadata) + properties = report.get_properties() # Assert - ks_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - tv_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - corr_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cont_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cardinality_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - - assert report._overall_quality_score == 0.42777777777777776 - assert report._property_breakdown == { - 'Column Shapes': 0.13333333333333333, - 'Column Pair Trends': 0.15000000000000002, - 'Cardinality': 1.0, - } - assert report._property_errors == { - 'Column Shapes': 1, - 'Column Pair Trends': 0, - 'Cardinality': 0, - } - - @patch('sdmetrics.reports.multi_table.quality_report.discretize_and_apply_metric') - def test_generate_single_table(self, mock_discretize_and_apply_metric): - """Test the ``generate`` method when there's only one table. - - Expect that the multi-table metrics are called. Expect that the parent-child - property score is NaN. - - Setup: - - Mock the expected multi-table metric compute breakdown calls. - - Input: - - Real data. - - Synthetic data. - - Metadata. + pd.testing.assert_frame_equal( + properties, + pd.DataFrame({ + 'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality'], + 'Score': [0.1, 0.2, 0.3], + }), + ) - Side Effects: - - Expect that each multi table metric's ``compute_breakdown`` methods are called once. - - Expect that the ``_overall_quality_score`` and ``_property_breakdown`` attributes - are populated. - """ + def test_get_properties_not_generated(self): + """Test the ``get_properties`` method when the report hasn't been generated.""" # Setup - mock_discretize_and_apply_metric.return_value = {} - real_data = { - 'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [1, 1, 1]}), - } - synthetic_data = { - 'table1': pd.DataFrame({'col1': [1, 3, 3], 'col2': ['b', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [3, 1, 3]}), - } - metadata = { - 'tables': { - 'table1': {}, - 'table2': {}, - }, - } + report = QualityReport() - ks_complement_mock = Mock() - ks_complement_mock.__name__ = 'KSComplement' - ks_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + # Run and Assert + msg = "The report has not been generated yet. Please call the 'generate' method." 
+ with pytest.raises(ValueError, match=msg): + report.get_properties() - tv_complement_mock = Mock() - tv_complement_mock.__name__ = 'TVComplement' - tv_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + def test_get_visualization(self): + """Test the ``get_vizualization`` method.""" + # Setup + report = QualityReport() + instance = Mock() + instance.get_visualization = Mock(return_value='visualization') + report._properties_instances = {'Cardinality': instance} + report._is_generated = True - corr_sim_mock = Mock() - corr_sim_mock.__name__ = 'CorrelationSimilarity' - corr_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Run + visualization = report.get_visualization('Cardinality') - cont_sim_mock = Mock() - cont_sim_mock.__name__ = 'ContingencySimilarity' - cont_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Assert + instance.get_visualization.assert_called_once_with(None) + assert visualization == 'visualization' - cardinality_mock = Mock() - cardinality_mock.__name__ = 'CardinalityShapeSimilarity' - cardinality_mock.compute_breakdown.return_value = { - 'score': np.nan, - } - metrics_mock = { - 'Column Shapes': [ks_complement_mock, tv_complement_mock], - 'Column Pair Trends': [corr_sim_mock, cont_sim_mock], - 'Cardinality': [cardinality_mock], - } + def test_get_visualization_not_generated(self): + """Test the ``get_visualization`` method when the report hasn't been generated.""" + # Setup + report = QualityReport() - # Run - with patch.object( - QualityReport, - 'METRICS', - metrics_mock, - ): - report = QualityReport() - report.generate(real_data, synthetic_data, metadata) + # Run and Assert + msg = "The report has not been generated yet. Please call the 'generate' method." + with pytest.raises(ValueError, match=msg): + report.get_visualization('property_name') - # Assert - ks_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - tv_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - corr_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cont_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cardinality_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - - assert report._overall_quality_score == 0.15000000000000002 - assert np.isnan(report._property_breakdown['Cardinality']) - - @patch('sdmetrics.reports.multi_table.quality_report.discretize_and_apply_metric') - def test_generate_with_non_applicable_metric(self, mock_discretize_and_apply_metric): - """Test the ``generate`` method with a non applicable metric. - - Expect that the multi-table metrics are called. Expect that if a metric is not - applicable, it is skipped. - - Setup: - - Mock the expected multi-table metric compute breakdown calls. - - Mock one metric to return an ``IncomputableMetricError``. + def test_get_visualization_invalid_property(self): + """Test it when the given property_name doesn't exist.""" + # Setup + report = QualityReport() + report._is_generated = True + report._properties_instances = {'property_name': None} - Input: - - Real data. - - Synthetic data. - - Metadata. 
+ # Run and Assert + msg = re.escape( + "Invalid property name ('invalid_name'). It must be one of " + "['property_name']." + ) + with pytest.raises(ValueError, match=msg): + report.get_visualization('invalid_name') - Side Effects: - - Expect that each multi table metric's ``compute_breakdown`` methods are called once. - - Expect that the ``_overall_quality_score`` and ``_property_breakdown`` attributes - are populated. - """ + def test_get_visualization_missing_table_name(self): + """Test it when table_name is missing and property is not Cardinality.""" # Setup - mock_discretize_and_apply_metric.return_value = {} - real_data = { - 'table1': pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [1, 1, 1]}), - } - synthetic_data = { - 'table1': pd.DataFrame({'col1': [1, 3, 3], 'col2': ['b', 'b', 'c']}), - 'table2': pd.DataFrame({'col1': [3, 1, 3]}), - } - metadata = { - 'tables': { - 'table1': {}, - 'table2': {}, - }, - } + report = QualityReport() + report._is_generated = True + report._properties_instances = {'Column Shapes': None} + report._tables = ['tab1'] - ks_complement_mock = Mock() - ks_complement_mock.__name__ = 'KSComplement' - ks_complement_mock.compute_breakdown.return_value = { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } + # Run and Assert + msg = "Table name must be provided when viewing details for property 'Column Shapes'." + with pytest.raises(ValueError, match=msg): + report.get_visualization('Column Shapes') - tv_complement_mock = Mock() - tv_complement_mock.__name__ = 'TVComplement' - tv_complement_mock.compute_breakdown.side_effect = IncomputableMetricError() - - corr_sim_mock = Mock() - corr_sim_mock.__name__ = 'CorrelationSimilarity' - corr_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + def test_get_visualization_invalid_table_name(self): + """Test it when table_name is invalid.""" + # Setup + report = QualityReport() + report._is_generated = True + report._properties_instances = {'Column Shapes': None} + report._tables = ['table'] - cont_sim_mock = Mock() - cont_sim_mock.__name__ = 'ContingencySimilarity' - cont_sim_mock.compute_breakdown.return_value = { - 'table1': { - ('col1', 'col2'): {'score': 0.1}, - ('col2', 'col3'): {'score': 0.2}, - } - } + # Run and Assert + msg = re.escape("Unknown table ('invalid_table'). 
Must be one of ['table'].") + with pytest.raises(ValueError, match=msg): + report.get_visualization('Column Shapes', 'invalid_table') - cardinality_mock = Mock() - cardinality_mock.__name__ = 'CardinalityShapeSimilarity' - cardinality_mock.compute_breakdown.return_value = { - ('table1', 'table2'): {'score': 1.0}, - } - metrics_mock = { - 'Column Shapes': [ks_complement_mock, tv_complement_mock], - 'Column Pair Trends': [corr_sim_mock, cont_sim_mock], - 'Cardinality': [cardinality_mock], - } + def test_get_details(self): + """Test the ``get_details`` method.""" + # Setup + report = QualityReport() + instance = Mock() + instance._details = {'details'} + report._properties_instances = {'Cardinality': instance} + report._is_generated = True # Run - with patch.object( - QualityReport, - 'METRICS', - metrics_mock, - ): - report = QualityReport() - report.generate(real_data, synthetic_data, metadata) + details = report.get_details('Cardinality') # Assert - ks_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - tv_complement_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - corr_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cont_sim_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - cardinality_mock.compute_breakdown.assert_called_once_with( - real_data, synthetic_data, metadata) - - assert report._overall_quality_score == 0.43333333333333335 - assert report._property_breakdown == { - 'Column Shapes': 0.15000000000000002, - 'Column Pair Trends': 0.15000000000000002, - 'Cardinality': 1.0, - } - assert report._metric_results['TVComplement'] == {} + assert details == {'details'} - def test_get_score(self): - """Test the ``get_score`` method. - - Expect that the overall quality score is returned. + def test_get_details_table_name(self): + """Test the ``get_details`` method with Cardinality and table_name.""" + # Setup + report = QualityReport() + instance = Mock() + instance._details = { + ('table1', 'table2'): {'score': 0.75}, + ('table1', 'table3'): {'score': 0.57} + } + report._properties_instances = {'Cardinality': instance} + report._is_generated = True + report._tables = ['table1', 'table2', 'table3'] - Setup: - - Mock the ``_overall_quality_score`` attribute. + # Run + details = report.get_details('Cardinality', 'table3') - Input: - - None + # Assert + assert details == {('table1', 'table3'): {'score': 0.57}} - Output: - - The overall quality score. - """ + def test_get_details_no_table_name(self): + """Test it works when table_name is None and property is not Cardinality.""" # Setup report = QualityReport() - mock_score = Mock() - report._overall_quality_score = mock_score + report._is_generated = True + instance = Mock() + details_mock = Mock() + details_mock._details = {'details'} + instance._properties = {'table': details_mock} + report._properties_instances = {'Column Shapes': instance} + report._tables = ['tab1'] # Run - score = report.get_score() + details = report.get_details('Column Shapes') # Assert - assert score == mock_score + assert details == {'table': {'details'}} - def test_get_properties(self): - """Test the ``get_properties`` method. + def test_get_details_not_generated(self): + """Test the ``get_details`` method when the report hasn't been generated.""" + # Setup + report = QualityReport() - Expect that the property score breakdown is returned. 
+ # Run and Assert + msg = "The report has not been generated yet. Please call the 'generate' method." + with pytest.raises(ValueError, match=msg): + report.get_details('property_name') - Setup: - - Mock the ``_property_breakdown`` attribute. + def test_get_details_invalid_property(self): + """Test it when the given property_name doesn't exist.""" + # Setup + report = QualityReport() + report._is_generated = True + report._properties_instances = {'property_name': None} - Input: - - None + # Run and Assert + msg = re.escape( + "Invalid property name ('invalid_name'). It must be one of " + "['property_name']." + ) + with pytest.raises(ValueError, match=msg): + report.get_details('invalid_name') - Output: - - The metric scores for each property. - """ + def test_get_details_invalid_table_name(self): + """Test it when table_name is invalid.""" # Setup report = QualityReport() - mock_property_breakdown = { - 'Column Shapes': 0.1, - 'Column Pair Trends': 0.2, - 'Cardinality': 0.3, - } - report._property_breakdown = mock_property_breakdown - - # Run - properties = report.get_properties() + report._is_generated = True + report._properties_instances = {'Column Shapes': None} + report._tables = ['table'] - # Assert - pd.testing.assert_frame_equal( - properties, - pd.DataFrame({ - 'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality'], - 'Score': [0.1, 0.2, 0.3], - }), - ) + # Run and Assert + msg = re.escape("Unknown table ('invalid_table'). Must be one of ['table'].") + with pytest.raises(ValueError, match=msg): + report.get_details('Column Shapes', 'invalid_table') @patch('sdmetrics.reports.multi_table.quality_report.pkg_resources.get_distribution') @patch('sdmetrics.reports.multi_table.quality_report.pickle') @@ -720,593 +529,3 @@ def test_load_mismatched_versions(self, pickle_mock, get_distribution_mock, warn 'currently using version `new_version`. Some features may not work as intended.' ) assert loaded == pickle_mock.load.return_value - - @patch('sdmetrics.reports.multi_table.quality_report.get_column_shapes_plot') - def test_get_visualization_column_shapes(self, get_plot_mock): - """Test the ``get_visualization`` method with Column Shapes. - - Input: - - property='Column Shapes' - - Output: - - visualization - - Side Effects: - - get_column_shapes_plot is called with the expected score breakdowns. - """ - # Setup - report = QualityReport() - report._metric_results['KSComplement'] = { - 'table1': {'col1': {'score': 'ks_complement_score'}}, - 'table2': {'col3': {'score': 'other_score'}}, - } - report._metric_results['TVComplement'] = { - 'table1': {'col2': {'score': 'tv_complement_score'}}, - 'table2': {'col4': {'score': 'other_score'}}, - } - - # Run - out = report.get_visualization('Column Shapes', table_name='table1') - - # Assert - get_plot_mock.assert_called_once_with({ - 'KSComplement': {'col1': {'score': 'ks_complement_score'}}, - 'TVComplement': {'col2': {'score': 'tv_complement_score'}}, - }) - assert out == get_plot_mock.return_value - - def test_get_visualization_column_shapes_no_table_name(self): - """Test the ``get_visualization`` method with Column Shapes and no table name. - - Expect that a ``ValueError`` is thrown. - - Input: - - property='Column Shapes' - - no table_name - - Side Effects: - - a ``ValueError`` is thrown. 
- """ - # Setup - report = QualityReport() - - # Run and assert - with pytest.raises( - ValueError, - match='Table name must be provided when viewing details for property Column Shapes', - ): - report.get_visualization('Column Shapes') - - @patch('sdmetrics.reports.multi_table.quality_report.get_column_pairs_plot') - def test_get_visualization_column_pairs(self, get_plot_mock): - """Test the ``get_visualization`` method with Column Pairs. - - Input: - - property='Column Pairs' - - Output: - - visualization - - Side Effects: - - get_column_pairs_plot is called with the expected score breakdowns. - """ - # Setup - report = QualityReport() - report._metric_results['CorrelationSimilarity'] = { - 'table1': {('col1', 'col2'): {'score': 'test_score_1'}}, - 'table2': {('col3', 'col4'): {'score': 'other_score'}}, - } - report._metric_results['ContingencySimilarity'] = { - 'table1': {('col5', 'col6'): {'score': 'test_score_2'}}, - 'table2': {('col7', 'col8'): {'score': 'other_score'}}, - } - - # Run - out = report.get_visualization('Column Pair Trends', table_name='table1') - - # Assert - get_plot_mock.assert_called_once_with({ - 'CorrelationSimilarity': {('col1', 'col2'): {'score': 'test_score_1'}}, - 'ContingencySimilarity': {('col5', 'col6'): {'score': 'test_score_2'}}, - }) - assert out == get_plot_mock.return_value - - @patch('sdmetrics.reports.multi_table.quality_report.get_table_relationships_plot') - def test_get_visualization_table_relationships(self, get_plot_mock): - """Test the ``get_visualization`` method with Cardinality. - - Input: - - property='Cardinality' - - Output: - - visualization - - Side Effects: - - get_parent_child_relationships_plot is called with the expected score breakdowns. - """ - # Setup - report = QualityReport() - report._metric_results['CardinalityShapeSimilarity'] = { - ('table1', 'table2'): {'score': 'test_score_1'}, - ('table3', 'table2'): {'score': 'test_score_2'}, - } - - # Run - out = report.get_visualization('Cardinality') - - # Assert - get_plot_mock.assert_called_once_with({ - 'CardinalityShapeSimilarity': { - ('table1', 'table2'): {'score': 'test_score_1'}, - ('table3', 'table2'): {'score': 'test_score_2'}, - }, - }) - assert out == get_plot_mock.return_value - - @patch('sdmetrics.reports.multi_table.quality_report.get_table_relationships_plot') - def test_get_visualization_table_relationships_with_table_name(self, get_plot_mock): - """Test the ``get_visualization`` method with Cardinality and a table. - - Input: - - property='Cardinality' - - table name - - Output: - - visualization - - Side Effects: - - get_parent_child_relationships_plot is called with the expected score breakdowns. - """ - # Setup - report = QualityReport() - report._metric_results['CardinalityShapeSimilarity'] = { - ('table1', 'table2'): {'score': 'test_score_1'}, - ('table3', 'table2'): {'score': 'test_score_2'}, - } - - # Run - out = report.get_visualization('Cardinality', table_name='table1') - - # Assert - get_plot_mock.assert_called_once_with({ - 'CardinalityShapeSimilarity': { - ('table1', 'table2'): {'score': 'test_score_1'}, - }, - }) - assert out == get_plot_mock.return_value - - def test_get_details_column_shapes(self): - """Test the ``get_details`` method with column shapes. - - Expect that the details of the desired property is returned. 
- - Input: - - property name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'KSComplement': { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - }, - 'table2': { - 'col3': {'score': 0.5}, - }, - }, - 'TVComplement': { - 'table1': { - 'col1': {'score': 0.3}, - 'col2': {'score': 0.4}, - }, - 'table2': { - 'col3': {'score': 0.6}, - }, - } - } - - # Run - out = report.get_details('Column Shapes') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Table': ['table1', 'table1', 'table1', 'table1', 'table2', 'table2'], - 'Column': ['col1', 'col2', 'col1', 'col2', 'col3', 'col3'], - 'Metric': [ - 'KSComplement', - 'KSComplement', - 'TVComplement', - 'TVComplement', - 'KSComplement', - 'TVComplement', - ], - 'Quality Score': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], - }) - ) - - def test_get_details_column_shapes_with_error(self): - """Test the ``get_details`` method with column shapes with error. - - Expect that the details of the desired property is returned, and that the details - contains an Error column. - - Input: - - property name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'KSComplement': { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'error': 'test error'}, - }, - 'table2': { - 'col3': {'score': 0.5}, - }, - }, - 'TVComplement': { - 'table1': { - 'col1': {'score': 0.3}, - 'col2': {'score': 0.4}, - }, - 'table2': { - 'col3': {'score': 0.6}, - }, - } - } - - # Run - out = report.get_details('Column Shapes') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Table': ['table1', 'table1', 'table1', 'table1', 'table2', 'table2'], - 'Column': ['col1', 'col2', 'col1', 'col2', 'col3', 'col3'], - 'Metric': [ - 'KSComplement', - 'KSComplement', - 'TVComplement', - 'TVComplement', - 'KSComplement', - 'TVComplement', - ], - 'Quality Score': [0.1, np.nan, 0.3, 0.4, 0.5, 0.6], - 'Error': [np.nan, 'test error', np.nan, np.nan, np.nan, np.nan], - }) - ) - - def test_get_details_column_pair_trends(self): - """Test the ``get_details`` method with column pair trends. - - Expect that the details of the desired property is returned. - - Input: - - property name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'CorrelationSimilarity': { - 'table1': { - ('col1', 'col3'): {'score': 0.1, 'real': 0.1, 'synthetic': 0.1}, - ('col2', 'col4'): {'score': 0.2, 'real': 0.2, 'synthetic': 0.2}, - }, - }, - 'ContingencySimilarity': { - 'table1': { - ('col1', 'col3'): {'score': 0.3, 'real': 0.3, 'synthetic': 0.3}, - ('col2', 'col4'): {'score': 0.4, 'real': 0.4, 'synthetic': 0.4}, - }, - } - } - - # Run - out = report.get_details('Column Pair Trends') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Table': ['table1', 'table1', 'table1', 'table1'], - 'Column 1': ['col1', 'col2', 'col1', 'col2'], - 'Column 2': ['col3', 'col4', 'col3', 'col4'], - 'Metric': [ - 'CorrelationSimilarity', - 'CorrelationSimilarity', - 'ContingencySimilarity', - 'ContingencySimilarity', - ], - 'Quality Score': [0.1, 0.2, 0.3, 0.4], - 'Real Correlation': [0.1, 0.2, 0.3, 0.4], - 'Synthetic Correlation': [0.1, 0.2, 0.3, 0.4], - }) - ) - - def test_get_details_cardinality(self): - """Test the ``get_details`` method with Cardinality. - - Expect that the details of the desired property is returned. 
- - Input: - - property name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'CardinalityShapeSimilarity': { - ('table1', 'table2'): {'score': 0.1}, - ('table1', 'table3'): {'score': 0.2}, - }, - } - warning_msg = ( - "The 'Parent Child Relationships' property name is no longer recognized. Please " - "update to 'Cardinality' instead." - ) - - # Run - out = report.get_details('Cardinality') - with pytest.warns(FutureWarning, match=warning_msg): - out_parent_child_relationships = report.get_details('Parent Child Relationships') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Child Table': ['table2', 'table3'], - 'Parent Table': ['table1', 'table1'], - 'Metric': ['CardinalityShapeSimilarity', 'CardinalityShapeSimilarity'], - 'Quality Score': [0.1, 0.2], - }) - ) - pd.testing.assert_frame_equal(out, out_parent_child_relationships) - - def test_get_details_cardinality_with_error(self): - """Test the ``get_details`` method with Cardinality with error. - - Expect that the details of the desired property is returned, and that the - details contains an Error column. - - Input: - - property name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'CardinalityShapeSimilarity': { - ('table1', 'table2'): {'score': 0.1}, - ('table1', 'table3'): {'score': 0.2}, - ('table1', 'table4'): {'error': 'test error'}, - }, - } - - # Run - out = report.get_details('Cardinality') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Child Table': ['table2', 'table3', 'table4'], - 'Parent Table': ['table1', 'table1', 'table1'], - 'Metric': [ - 'CardinalityShapeSimilarity', - 'CardinalityShapeSimilarity', - 'CardinalityShapeSimilarity', - ], - 'Quality Score': [0.1, 0.2, np.nan], - 'Error': [np.nan, np.nan, 'test error'], - }) - ) - - def test_get_details_parent_child_relationships_with_table_name(self): - """Test the ``get_details`` method with Cardinality with a table filter. - - Expect that the details of the desired property is returned. - - Input: - - property name - - table name - - Output: - - score details for the desired property - """ - # Setup - report = QualityReport() - report._metric_results = { - 'CardinalityShapeSimilarity': { - ('table1', 'table2'): {'score': 0.1}, - ('table1', 'table3'): {'score': 0.2}, - }, - } - - # Run - out = report.get_details('Cardinality', table_name='table2') - - # Assert - pd.testing.assert_frame_equal( - out, - pd.DataFrame({ - 'Child Table': ['table2'], - 'Parent Table': ['table1'], - 'Metric': ['CardinalityShapeSimilarity'], - 'Quality Score': [0.1], - }) - ) - - def test_get_raw_result(self): - """Test the ``get_raw_result`` method. - - Expect that the raw result of the desired metric is returned. Expect that null scores - are excluded. - - Input: - - metric name - - Output: - - Metric details - """ - # Setup - report = QualityReport() - report._metric_results = { - 'KSComplement': { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - 'col3': {'score': np.nan}, - }, - }, - } - - # Run - out = report.get_raw_result('KSComplement') - - # Assert - assert out == [ - { - 'metric': { - 'method': 'sdmetrics.multi_table.multi_single_table.KSComplement', - 'parameters': {}, - }, - 'results': { - 'table1': { - 'col1': {'score': 0.1}, - 'col2': {'score': 0.2}, - } - } - } - ] - - def test__print_result(self): - """Test the ``_print_results`` method. 
- - Expect that the correct messages are written. - - Input: - - out argument - - Side Effects: - - messages are written to the output. - """ - # Setup - report = QualityReport() - report._overall_quality_score = 0.8 - report._property_breakdown = { - 'Column Shapes': 0.6, - 'Column Pair Trends': 0.8, - 'Cardinality': 0.9 - } - report._property_errors = { - 'Column Shapes': 0, - 'Column Pair Trends': 0, - 'Cardinality': 0, - } - mock_out = Mock() - - # Run - report._print_results(mock_out) - - # Assert - mock_out.write.assert_has_calls([ - call('\nOverall Quality Score: 80.0%\n\n'), - call('Properties:\n'), - call('Column Shapes: 60.0%\n'), - call('Column Pair Trends: 80.0%\n'), - call('Cardinality: 90.0%\n'), - ]) - - def test__print_result_with_error(self): - """Test the ``_print_results`` method with errors. - - Expect that the correct messages are written. - - Input: - - out argument - - Side Effects: - - messages are written to the output. - """ - # Setup - report = QualityReport() - report._overall_quality_score = 0.7 - report._property_breakdown = { - 'Column Shapes': 0.6, - 'Column Pair Trends': np.nan, - 'Cardinality': 0.8, - } - report._property_errors = { - 'Column Shapes': 0, - 'Column Pair Trends': 1, - 'Cardinality': 0, - } - mock_out = Mock() - - # Run - report._print_results(mock_out) - - # Assert - mock_out.write.assert_has_calls([ - call('\nOverall Quality Score: 70.0%\n\n'), - call('Properties:\n'), - call('Column Shapes: 60.0%\n'), - call('Column Pair Trends: Error computing property.\n'), - call('Cardinality: 80.0%\n'), - ]) - - def test__print_result_with_all_errors(self): - """Test the ``_print_results`` method with all properties erroring out. - - Expect that the correct messages are written. - - Input: - - out argument - - Side Effects: - - messages are written to the output. - """ - # Setup - report = QualityReport() - report._overall_quality_score = np.nan - report._property_breakdown = { - 'Column Shapes': np.nan, - 'Column Pair Trends': np.nan, - 'Cardinality': np.nan - } - report._property_errors = { - 'Column Shapes': 1, - 'Column Pair Trends': 1, - 'Cardinality': 1, - } - mock_out = Mock() - - # Run - report._print_results(mock_out) - - # Assert - mock_out.write.assert_has_calls([ - call('\nOverall Quality Score: Error computing report.\n\n'), - call('Properties:\n'), - call('Column Shapes: Error computing property.\n'), - call('Column Pair Trends: Error computing property.\n'), - call('Cardinality: Error computing property.\n'), - ])
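
For readers tracing the new behaviour, the three `get_details` validation tests added at the top of this section pin down the expected error handling: the report must have been generated first, the requested property must be one of the registered property instances, and any table filter must name a known table. Below is a minimal standalone sketch of that validation order, written only against the attribute names the tests assign directly (`_is_generated`, `_properties_instances`, `_tables`) and the error messages they assert; it is an illustration inferred from the tests, not the actual sdmetrics implementation.

class QualityReportValidationSketch:
    """Illustrative only: mirrors the checks exercised by the new get_details tests."""

    def __init__(self):
        # Attributes the tests set directly on a bare QualityReport instance.
        self._is_generated = False
        self._properties_instances = {}
        self._tables = []

    def get_details(self, property_name, table_name=None):
        # 1. The report must have been generated before details can be requested.
        if not self._is_generated:
            raise ValueError(
                "The report has not been generated yet. Please call the 'generate' method."
            )

        # 2. The property must be one of the registered property instances
        #    (see test_get_details_invalid_property).
        if property_name not in self._properties_instances:
            valid_names = list(self._properties_instances.keys())
            raise ValueError(
                f"Invalid property name ('{property_name}'). It must be one of {valid_names}."
            )

        # 3. An optional table filter must refer to a known table
        #    (see test_get_details_invalid_table_name).
        if table_name is not None and table_name not in self._tables:
            raise ValueError(f"Unknown table ('{table_name}'). Must be one of {self._tables}.")

        # The real report would now delegate to the selected property instance.
        return self._properties_instances[property_name]

For example, calling get_details('Column Shapes', 'invalid_table') on an instance with _is_generated set to True and _tables equal to ['table'] raises the third error with exactly the message that test_get_details_invalid_table_name matches against.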