-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathmetrics.py
130 lines (93 loc) · 3.29 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
from sdv.evaluation import evaluate
import gower
import pandas as pd
from sdv.metrics.tabular import NumericalMLP, CategoricalSVM
# Distributional metrics - check the distribution differences between the synthetic and original
# datasets, as well as how easy it is to discriminate between them, i.e. SVC detection
def distribution_metrics(
    gower_bool,
    distributional_metrics,
    data_supp,
    synthetic_supp,
    categorical_columns,
    continuous_columns,
    saving_filepath=None,
    pre_proc_method="GMM",
):
    """Score how closely the synthetic data matches the original data.

    Runs SDV's ``evaluate`` for the requested metrics and, optionally,
    appends the mean Gower distance between the two datasets.

    Args:
        gower_bool: When truthy, a "Gower" column holding the mean of the
            Gower distance matrix between the two datasets is appended.
        distributional_metrics: Sequence of metric names forwarded to
            ``evaluate``; also used as the output column labels.
        data_supp: Original DataFrame. It is not modified.
        synthetic_supp: Synthetic DataFrame. It must contain every column of
            ``data_supp`` (a ``KeyError`` is raised by pandas otherwise). It
            is not modified.
        categorical_columns: Columns cast to ``object`` before evaluation so
            that they are treated as categorical rather than numerical.
        continuous_columns: Unused; kept for interface compatibility.
        saving_filepath: If not ``None``, the result is written to
            ``"<saving_filepath>Metrics_SynthVAE_<pre_proc_method>.csv"``.
        pre_proc_method: Label embedded in the CSV file name.

    Returns:
        A single-row ``pandas.DataFrame`` with one column per requested
        metric, plus "Gower" when ``gower_bool`` is truthy.
    """
    metric_names = list(distributional_metrics)

    # Work on private copies: the dtype casts below must not leak into the
    # caller's frames, and assigning into a column-selected frame without a
    # copy triggers pandas' chained-assignment warning.
    real = data_supp.copy()
    # Both frames need the same column order.
    synthetic = synthetic_supp[real.columns].copy()

    # SDV infers data types from the fields, so integer/float encoded
    # categoricals must be converted to objects to be seen as categorical.
    synthetic[categorical_columns] = synthetic[categorical_columns].astype(object)
    real[categorical_columns] = real[categorical_columns].astype(object)

    evals = evaluate(synthetic, real, metrics=metric_names, aggregate=False)

    # NOTE(review): the column labels below assume ``evaluate`` returns its
    # rows in the same order as ``metric_names`` - confirm against SDV.
    scores = np.array(evals["raw_score"])
    columns = metric_names

    if gower_bool:
        # Summarise the pairwise Gower distance matrix by its mean.
        scores = np.append(scores, np.mean(gower.gower_matrix(real, synthetic)))
        columns = metric_names + ["Gower"]

    metrics = pd.DataFrame(data=[scores], columns=columns)

    # Persist the metrics only if the user asked for it.
    if saving_filepath is not None:
        metrics.to_csv(
            "{}Metrics_SynthVAE_{}.csv".format(saving_filepath, pre_proc_method)
        )

    return metrics
# Build in some privacy metrics from SDV - TO DO!!!
def privacy_metrics(
    private_variable,
    data_supp,
    synthetic_supp,
    categorical_columns,
    continuous_columns,
    saving_filepath=None,
    pre_proc_method="GMM",
):
    """Compute an SDV privacy score for a single sensitive column.

    The attacker model is chosen from the type of ``private_variable``:
    ``NumericalMLP`` when it is listed in ``continuous_columns`` (checked
    first), otherwise ``CategoricalSVM`` when it is listed in
    ``categorical_columns``. The key fields are the remaining columns of that
    same list. Missing values in both datasets are replaced with 0 before
    the metric is computed.

    Args:
        private_variable: Name of the sensitive column.
        data_supp: Original DataFrame.
        synthetic_supp: Synthetic DataFrame.
        categorical_columns: Names of the categorical columns.
        continuous_columns: Names of the continuous columns.
        saving_filepath: Unused; kept for interface compatibility.
        pre_proc_method: Unused; kept for interface compatibility.

    Returns:
        Whatever the selected metric's ``compute`` returns, or ``None`` when
        ``private_variable`` is in neither column list.
    """
    # Pick the attacker and the pool of candidate key fields together.
    if private_variable in continuous_columns:
        attacker, candidates = NumericalMLP, continuous_columns
    elif private_variable in categorical_columns:
        attacker, candidates = CategoricalSVM, categorical_columns
    else:
        return None

    # The sensitive column itself must not be used as a key field.
    key_fields = [name for name in candidates if name != private_variable]

    return attacker.compute(
        data_supp.fillna(0),
        synthetic_supp.fillna(0),
        key_fields=key_fields,
        sensitive_fields=[private_variable],
    )
# Build in some fairness metrics (will have to find a library/code these ourselves) - TO DO!!!
def fairness_metrics(
    user_metrics,
    data_supp,
    synthetic_supp,
    categorical_columns,
    continuous_columns,
    saving_filepath=None,
    pre_proc_method="GMM",
):
    """Placeholder for fairness metrics; not implemented yet.

    All arguments are accepted but ignored.

    Returns:
        Always ``None``.
    """
    # Nothing is computed yet; the function falls through and returns None.