def one_hot_encode_data(self, data):
+     """One-hot-encodes the data."""
+     return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
+
+
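# A minimal usage sketch (toy dataframe assumed): with drop_first=False every
# category keeps its own indicator column, which from_dummies() later relies
# on to invert the encoding.
import pandas as pd

toy = pd.DataFrame({'age': [22, 31], 'education': ['school', 'college']})
encoded = pd.get_dummies(toy, drop_first=False, columns=['education'])
print(encoded.columns.tolist())  # ['age', 'education_college', 'education_school']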
def normalize_data(self, df, encoding='one-hot'):
    """Normalizes continuous features to make them fall in the range [0,1]."""
    result = df.copy()
    for feature_name in self.continuous_feature_names:
        max_value = self.permitted_range[feature_name][1]
        min_value = self.permitted_range[feature_name][0]
-         result[feature_name] = (
-             df[feature_name] - min_value) / (max_value - min_value)
+         result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
+
+     # if encoding == 'label':  # need not do this if not required
+     #     for ix in self.categorical_feature_indexes:
+     #         feature_name = self.feature_names[ix]
+     #         max_value = len(self.categorical_levels[feature_name]) - 1
+     #         min_value = 0
+     #         result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
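# A sketch of the scaling above, assuming a toy permitted_range of [20, 60]
# for an 'age' feature: the endpoints map to 0 and 1, midpoints fall between.
min_value, max_value = 20, 60
print([(age - min_value) / (max_value - min_value) for age in (20, 40, 60)])
# [0.0, 0.5, 1.0]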
def de_normalize_data(self, df):
    """De-normalizes continuous features from [0,1] range to original range."""
+     if len(df) == 0:
+         return df
    result = df.copy()
    for feature_name in self.continuous_feature_names:
        max_value = self.permitted_range[feature_name][1]
@@ -267,8 +289,8 @@
Source code for dice_ml.data_interfaces.private_data_interface
def get_minx_maxx(self, normalized=True):
    """Gets the min/max value of features in normalized or de-normalized form."""
-     minx = np.array([[0.0] * len(self.encoded_feature_names)])
-     maxx = np.array([[1.0] * len(self.encoded_feature_names)])
+     minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)])
+     maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)])
    if normalized:
        return minx, maxx
@@ -306,22 +328,51 @@
def create_ohe_params(self):
+     if len(self.categorical_feature_names) > 0:
+         # simulating sklearn's one-hot-encoding
+         # continuous features on the left
+         self.ohe_encoded_feature_names = [
+             feature for feature in self.continuous_feature_names]
+         for feature_name in self.categorical_feature_names:
+             for category in sorted(self.categorical_levels[feature_name]):
+                 self.ohe_encoded_feature_names.append(
+                     feature_name + '_' + category)
+     else:
+         # one-hot-encoded data is the same as the original data if there are no categorical features.
+         self.ohe_encoded_feature_names = [feat for feat in self.feature_names]
+
+     self.ohe_base_df = self.prepare_df_for_ohe_encoding()  # base dataframe for doing one-hot-encoding
+     # ohe_encoded_feature_names and ohe_base_df are created (and stored as the data class's parameters)
+     # when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
+
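# A sketch of the column order built above, assuming one continuous feature
# and one categorical feature with two levels: continuous names come first,
# then each sorted category level joined to its parent with '_'.
continuous_feature_names = ['age']
categorical_levels = {'education': ['school', 'college']}
ohe_names = [feature for feature in continuous_feature_names]
for feature_name, levels in categorical_levels.items():
    for category in sorted(levels):
        ohe_names.append(feature_name + '_' + category)
print(ohe_names)  # ['age', 'education_college', 'education_school']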
def get_data_params_for_gradient_dice(self):
    """Gets all data-related params for DiCE."""
+     self.create_ohe_params()
    minx, maxx = self.get_minx_maxx(normalized=True)

-     # get the column indexes of categorical features after one-hot-encoding
-     self.encoded_categorical_feature_indexes = self.get_encoded_categorical_feature_indexes()
+     # get the column indexes of categorical and continuous features after one-hot-encoding
+     encoded_categorical_feature_indexes = self.get_encoded_categorical_feature_indexes()
+     flattened_indexes = [item for sublist in encoded_categorical_feature_indexes for item in sublist]
+     encoded_continuous_feature_indexes = [ix for ix in range(len(minx[0])) if ix not in flattened_indexes]
+
+     # min and max for continuous features in the original scale
+     org_minx, org_maxx = self.get_minx_maxx(normalized=False)
+     cont_minx = list(org_minx[0][encoded_continuous_feature_indexes])
+     cont_maxx = list(org_maxx[0][encoded_continuous_feature_indexes])
+
+     # decimal precisions for continuous features
+     cont_precisions = [self.get_decimal_precisions()[ix] for ix in range(len(self.continuous_feature_names))]
+
+     return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, cont_minx, cont_maxx, cont_precisions
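# How the index bookkeeping above behaves on toy values: the nested
# categorical index lists are flattened, and whatever positions remain
# are treated as continuous columns.
minx = [[0.0, 0.0, 0.0]]
encoded_categorical_feature_indexes = [[1, 2]]
flattened = [item for sublist in encoded_categorical_feature_indexes for item in sublist]
continuous = [ix for ix in range(len(minx[0])) if ix not in flattened]
print(flattened, continuous)  # [1, 2] [0]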
def get_encoded_categorical_feature_indexes(self):
    """Gets the column indexes of categorical features after one-hot-encoding."""
    cols = []
    for col_parent in self.categorical_feature_names:
-         temp = [self.encoded_feature_names.index(
-             col) for col in self.encoded_feature_names if col.startswith(col_parent) and
+         temp = [self.ohe_encoded_feature_names.index(
+             col) for col in self.ohe_encoded_feature_names if col.startswith(col_parent) and
+                 col not in self.continuous_feature_names]
        cols.append(temp)
    return cols
@@ -329,7 +380,7 @@
def get_indexes_of_features_to_vary(self, features_to_vary='all'):
    """Gets indexes from feature names of one-hot-encoded data."""
    if features_to_vary == "all":
-         return [i for i in range(len(self.encoded_feature_names))]
+         return [i for i in range(len(self.ohe_encoded_feature_names))]
    else:
        ixs = []
        encoded_cats_ixs = self.get_encoded_categorical_feature_indexes()
@@ -341,6 +392,18 @@
                ixs.append(colidx)
        return ixs
+
def from_label(self, data):
+     """Transforms label-encoded data back to categorical values."""
+     out = data.copy()
+     if isinstance(data, pd.DataFrame) or isinstance(data, dict):
+         for column in self.categorical_feature_names:
+             out[column] = self.labelencoder[column].inverse_transform(out[column].round().astype(int).tolist())
+         return out
+     elif isinstance(data, list):
+         for column in self.categorical_feature_indexes:
+             out[column] = self.labelencoder[self.feature_names[column]].inverse_transform([round(out[column])])[0]
+         return out
+
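# A round-trip sketch of the decoding above, assuming sklearn's LabelEncoder
# and a toy 'education' column.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
column = pd.Series(['school', 'college', 'school'])
codes = le.fit_transform(column)                     # e.g. array([1, 0, 1])
decoded = le.inverse_transform(codes.round().astype(int).tolist())
print(list(decoded))  # ['school', 'college', 'school']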
def from_dummies(self, data, prefix_sep='_'):
    """Gets the original data from dummy-encoded data with k levels."""
    out = data.copy()
@@ -354,7 +417,7 @@
def get_decimal_precisions(self):
    """Gets the precision of continuous features in the data."""
-     precisions = [0] * len(self.feature_names)
+     precisions = [0] * len(self.continuous_feature_names)
    for ix, feature_name in enumerate(self.continuous_feature_names):
        type_prec = self.type_and_precision[feature_name]
        if type_prec == 'int':
@@ -363,65 +426,90 @@
def get_decoded_data(self, data):
-     """Gets the original data from dummy-encoded data."""
-     if isinstance(data, np.ndarray):
-         index = [i for i in range(0, len(data))]
+
def get_decoded_data(self, data, encoding='one-hot'):
+     """Gets the original data from encoded data."""
+     if len(data) == 0:
+         return data
+
+     index = [i for i in range(0, len(data))]
+     if encoding == 'one-hot':
+         if isinstance(data, pd.DataFrame):
+             return self.from_dummies(data)
+         elif isinstance(data, np.ndarray):
+             data = pd.DataFrame(data=data, index=index,
+                                 columns=self.ohe_encoded_feature_names)
+             return self.from_dummies(data)
+         else:
+             raise ValueError("data should be a pandas dataframe or a numpy array")
+
+     elif encoding == 'label':
        data = pd.DataFrame(data=data, index=index,
-                             columns=self.encoded_feature_names)
-         return self.from_dummies(data)
def prepare_df_for_ohe_encoding(self):
+     """Creates the base dataframe for one-hot-encoding a single instance or a set of instances."""
    levels = []
-     colnames = self.categorical_feature_names
+     colnames = [feat for feat in self.categorical_feature_names]
    for cat_feature in colnames:
        levels.append(self.categorical_levels[cat_feature])

-     df = pd.DataFrame({colnames[0]: levels[0]})
+     if len(colnames) > 0:
+         df = pd.DataFrame({colnames[0]: levels[0]})
+     else:
+         df = pd.DataFrame()
+
    for col in range(1, len(colnames)):
        temp_df = pd.DataFrame({colnames[col]: levels[col]})
        df = pd.concat([df, temp_df], axis=1, sort=False)

-     colnames = self.continuous_feature_names
+     colnames = [feat for feat in self.continuous_feature_names]
    for col in range(0, len(colnames)):
        temp_df = pd.DataFrame({colnames[col]: []})
        df = pd.concat([df, temp_df], axis=1, sort=False)
    return df
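# A sketch of the base frame built above (toy levels assumed): each
# categorical level appears once so that a later pd.get_dummies() call sees
# every level, while continuous columns are added empty.
import pandas as pd

base = pd.DataFrame({'education': ['college', 'school']})
base = pd.concat([base, pd.DataFrame({'age': []})], axis=1, sort=False)
print(base.columns.tolist(), len(base))  # ['education', 'age'] 2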
-
def one_hot_encode_data(self, data):
-     """One-hot-encodes the data."""
-     return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)
-
-
def prepare_query_instance(self, query_instance, encode):
-     """Prepares user-defined test input for DiCE."""
-
+
def prepare_query_instance(self, query_instance):
+     """Prepares user-defined test input(s) for DiCE."""
    if isinstance(query_instance, list):
-         query_instance = {'row1': query_instance}
-         test = pd.DataFrame.from_dict(
-             query_instance, orient='index', columns=self.feature_names)
+         if isinstance(query_instance[0], dict):  # prepare a list of query instances
+             test = pd.DataFrame(query_instance, columns=self.feature_names)
+
+         else:  # prepare a single query instance given as a list
+             query_instance = {'row1': query_instance}
+             test = pd.DataFrame.from_dict(
+                 query_instance, orient='index', columns=self.feature_names)

    elif isinstance(query_instance, dict):
-         query_instance = dict(zip(query_instance.keys(), [[q] for q in query_instance.values()]))
-         test = pd.DataFrame(query_instance, columns=self.feature_names)
+         test = pd.DataFrame({k: [v] for k, v in query_instance.items()}, columns=self.feature_names)

-     test = test.reset_index(drop=True)
+     elif isinstance(query_instance, pd.DataFrame):
+         test = query_instance.copy()

-     if encode is False:
-         return self.normalize_data(test)
    else:
-         temp = self.prepare_df_for_encoding()
-         temp = temp.append(test, ignore_index=True, sort=False)
-         temp = self.one_hot_encode_data(temp)
-         temp = self.normalize_data(temp)
+         raise ValueError("Query instance should be a dict, a pandas dataframe, a list, or a list of dicts")

-         return temp.tail(test.shape[0]).reset_index(drop=True)
-
-
-def get_dev_data(self, model_interface, desired_class, filter_threshold=0.5):
-     """Constructs dev data by extracting the part of the test data for which finding counterfactuals makes sense."""
-     raise ValueError(
-         "Cannot compute dev data from only metadata information")
+     test = test.reset_index(drop=True)
+     return test
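# The input shapes accepted above, sketched with assumed feature names
# ['age', 'education']; each branch ends in a dataframe with those columns.
import pandas as pd

feature_names = ['age', 'education']
as_dict = {'age': 22, 'education': 'school'}
as_list = [22, 'school']
as_list_of_dicts = [{'age': 22, 'education': 'school'},
                    {'age': 31, 'education': 'college'}]

print(pd.DataFrame({k: [v] for k, v in as_dict.items()}, columns=feature_names))
print(pd.DataFrame.from_dict({'row1': as_list}, orient='index', columns=feature_names))
print(pd.DataFrame(as_list_of_dicts, columns=feature_names))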
+
+
def get_ohe_min_max_normalized_data(self, query_instance):
+     """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts."""
+     query_instance = self.prepare_query_instance(query_instance)
+     temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
+     temp = self.one_hot_encode_data(temp)
+     temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
+     return self.normalize_data(temp)  # returns a pandas dataframe
+
+
def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
+     """Transforms one-hot-encoded and min-max normalized data back into the raw user-fed data format. transformed_data should be a dataframe or an array."""
+     raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
+     raw_data = self.de_normalize_data(raw_data)
+     precisions = self.get_decimal_precisions()
+     for ix, feature in enumerate(self.continuous_feature_names):
+         raw_data[feature] = raw_data[feature].astype(float).round(precisions[ix])
+     raw_data = raw_data[self.feature_names]
+     return raw_data  # returns a pandas dataframe
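# The inverse transform above runs the forward pipeline backwards: dummy
# decoding, then de-normalization, then rounding continuous values back to
# their original precision. A sketch of the last two steps, assuming a data
# range of [20, 60] and precision 1 for 'age':
normalized = 0.5
raw = normalized * (60 - 20) + 20   # de-normalize: 40.0
print(round(raw, 1))                # 40.0, displayed at the original precision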
@@ -434,11 +522,19 @@
Source code for dice_ml.data_interfaces.public_data_interface
-"""Module containing all required information about the raw or transformed public data."""
+"""Module containing all required information about the interface between raw (or transformed) public data and DiCE explainers."""importpandasaspdimportnumpyasnpfromsklearn.model_selectionimporttrain_test_splitimportlogging
-
-importtensorflowastf
-fromtensorflowimportkeras
+fromcollectionsimportdefaultdict
class PublicData:
-     """A data interface for public data."""
+     """A data interface for public data. This class is an interface to DiCE explainers and contains methods to transform user-fed raw data into the format a DiCE explainer requires, and vice versa."""

    def __init__(self, params):
        """Init method

-         :param dataframe: Pandas DataFrame.
+         :param dataframe: The train dataframe used by the explainer method.
        :param continuous_features: List of names of continuous features. The remaining features are categorical features.
        :param outcome_name: Outcome feature name.
        :param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values. Defaults to the range inferred from training data.
-         :param test_size (optional): Proportion of test set split. Defaults to 0.2.
-         :param test_split_random_state (optional): Random state for train-test split. Defaults to 17.
        :param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
        :param data_name (optional): Dataset name.
@@ -202,7 +203,7 @@
raise ValueError("should provide the name of outcome feature")self.categorical_feature_names=[namefornameinself.data_df.columns.tolist(
- )ifnamenotinself.continuous_feature_names+[self.outcome_name]]
+ )ifnamenotinself.continuous_feature_names+[self.outcome_name]]self.feature_names=[namefornameinself.data_df.columns.tolist()ifname!=self.outcome_name]
@@ -213,19 +214,6 @@
        self.categorical_feature_indexes = [self.data_df.columns.get_loc(name)
                                            for name in self.categorical_feature_names if name in self.data_df]

-         if 'test_size' in params:
-             self.test_size = params['test_size']
-             if self.test_size > 1 or self.test_size < 0:
-                 raise ValueError(
-                     "should provide a decimal between 0 and 1")
-         else:
-             self.test_size = 0.2
-
-         if 'test_split_random_state' in params:
-             self.test_split_random_state = params['test_split_random_state']
-         else:
-             self.test_split_random_state = 17
-
        if 'continuous_features_precision' in params:
            self.continuous_features_precision = params['continuous_features_precision']
        else:
@@ -246,57 +234,59 @@
            self.data_df[feature] = self.data_df[feature].astype(np.int32)

-         if len(self.categorical_feature_names) > 0:
-             self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
-             self.encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
-             ) if x not in np.array([self.outcome_name])]
-         else:
-             # one-hot-encoded data is the same as the original data if there are no categorical features.
-             self.one_hot_encoded_data = self.data_df
-             self.encoded_feature_names = self.feature_names
-
-         self.train_df, self.test_df = self.split_data(self.data_df)
-
+         # should move the below snippet to gradient-based DiCE interfaces
+         # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
+         # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
+         # ) if x not in np.array([self.outcome_name])]
+
+         # should move the below snippet to model-agnostic DiCE interfaces
+         # # Initializing a label encoder to obtain label-encoded values for categorical variables
+         # self.labelencoder = {}
+         #
+         # self.label_encoded_data = self.data_df.copy()
+         #
+         # for column in self.categorical_feature_names:
+         #     self.labelencoder[column] = LabelEncoder()
+         #     self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])
+
+         input_permitted_range = None
        if 'permitted_range' in params:
-             self.permitted_range = params['permitted_range']
-             if not self.check_features_range():
-                 raise ValueError(
-                     "permitted range of features should be within their original range")
-         else:
-             self.permitted_range = self.get_features_range()
+             input_permitted_range = params['permitted_range']
+         self.permitted_range, feature_ranges_orig = self.get_features_range(input_permitted_range)
+
+         # should move the below snippet to model-agnostic DiCE interfaces
+         # self.max_range = -np.inf
+         # for feature in self.continuous_feature_names:
+         #     self.max_range = max(self.max_range, self.permitted_range[feature][1])

        if 'data_name' in params:
            self.data_name = params['data_name']
        else:
            self.data_name = 'mydata'
-
def get_features_range(self, permitted_range_input=None):
    ranges = {}
+     # Getting default ranges based on the dataset
    for feature_name in self.continuous_feature_names:
        ranges[feature_name] = [
-             self.train_df[feature_name].min(), self.train_df[feature_name].max()]
-     return ranges
+             self.data_df[feature_name].min(), self.data_df[feature_name].max()]
+     for feature_name in self.categorical_feature_names:
+         ranges[feature_name] = self.data_df[feature_name].unique().tolist()
+     feature_ranges_orig = ranges.copy()
+     # Overwriting the range of a feature if an input range is provided
+     if permitted_range_input is not None:
+         for feature_name, feature_range in permitted_range_input.items():
+             ranges[feature_name] = feature_range
+     return ranges, feature_ranges_orig
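# A sketch of the range logic above on toy data: defaults are read from the
# dataset, then any user-supplied permitted_range entry overwrites them.
import pandas as pd

data_df = pd.DataFrame({'age': [22, 31, 58],
                        'education': ['school', 'college', 'school']})
ranges = {'age': [data_df['age'].min(), data_df['age'].max()],
          'education': data_df['education'].unique().tolist()}
for feature_name, feature_range in {'age': [18, 65]}.items():
    ranges[feature_name] = feature_range
print(ranges)  # {'age': [18, 65], 'education': ['school', 'college']}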
def get_data_type(self, col):
-     """Infers data type of a feature from the training data."""
-     if (self.data_df[col].dtype == np.int64) or (self.data_df[col].dtype == np.int32):
+     """Infers the data type of a continuous feature from the training data."""
+     if (self.data_df[col].dtype == np.int64) or (self.data_df[col].dtype == np.int32):
        return 'int'
-     elif (self.data_df[col].dtype == np.float64) or (self.data_df[col].dtype == np.float32):
+     elif (self.data_df[col].dtype == np.float64) or (self.data_df[col].dtype == np.float32):
        return 'float'
    else:
-         raise ValueError("Unknown data type of feature %s: must be int or float" % col)
+         raise ValueError("Unknown data type of feature %s: must be int or float" % col)
def one_hot_encode_data(self, data):
    """One-hot-encodes the data."""
@@ -306,30 +296,69 @@
"""Normalizes continuous features to make them fall in the range [0,1]."""
result=df.copy()forfeature_nameinself.continuous_feature_names:
- max_value=self.train_df[feature_name].max()
- min_value=self.train_df[feature_name].min()
+ max_value=self.data_df[feature_name].max()
+ min_value=self.data_df[feature_name].min()result[feature_name]=(
- df[feature_name]-min_value)/(max_value-min_value)
+ df[feature_name]-min_value)/(max_value-min_value)
+ #if encoding == 'label':
+ # for ix in self.categorical_feature_indexes:
+ # feature_name = self.feature_names[ix]
+ # max_value = len(self.train_df[feature_name].unique())-1
+ # min_value = 0
+ # result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)returnresult
def de_normalize_data(self, df):
    """De-normalizes continuous features from [0,1] range to original range."""
+     if len(df) == 0:
+         return df
    result = df.copy()
    for feature_name in self.continuous_feature_names:
-         max_value = self.train_df[feature_name].max()
-         min_value = self.train_df[feature_name].min()
+         max_value = self.data_df[feature_name].max()
+         min_value = self.data_df[feature_name].min()
        result[feature_name] = (
-             df[feature_name] * (max_value - min_value)) + min_value
+             df[feature_name] * (max_value - min_value)) + min_value
    return result
+
def get_valid_feature_range(self, feature_range_input, normalized=True):
+     """Gets the min/max value of features in normalized or de-normalized
+     form. Assumes that all features are already encoded to numerical form
+     such that the number of features remains the same.
+
+     # TODO: needs work to adhere to the label-encoded max and to support permitted_range
+     for both continuous and discrete features when provided in _generate_counterfactuals.
+     """
+     feature_range = {}
+
+     for idx, feature_name in enumerate(self.feature_names):
+         feature_range[feature_name] = []
+         if feature_name in self.continuous_feature_names:
+             max_value = self.data_df[feature_name].max()
+             min_value = self.data_df[feature_name].min()
+
+             if normalized:
+                 minx = (feature_range_input[feature_name]
+                         [0] - min_value) / (max_value - min_value)
+                 maxx = (feature_range_input[feature_name]
+                         [1] - min_value) / (max_value - min_value)
+             else:
+                 minx = feature_range_input[feature_name][0]
+                 maxx = feature_range_input[feature_name][1]
+             feature_range[feature_name].append(minx)
+             feature_range[feature_name].append(maxx)
+         else:
+             # categorical features
+             feature_range[feature_name] = feature_range_input[feature_name]
+     return feature_range
+
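# How a user-supplied range is mapped into normalized coordinates above,
# assuming a data min/max of 20/60 for 'age':
min_value, max_value = 20, 60
feature_range_input = {'age': [30, 50]}
minx = (feature_range_input['age'][0] - min_value) / (max_value - min_value)
maxx = (feature_range_input['age'][1] - min_value) / (max_value - min_value)
print(minx, maxx)  # 0.25 0.75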
def get_minx_maxx(self, normalized=True):
    """Gets the min/max value of features in normalized or de-normalized form."""
-     minx = np.array([[0.0] * len(self.encoded_feature_names)])
-     maxx = np.array([[1.0] * len(self.encoded_feature_names)])
+     minx = np.array([[0.0] * len(self.ohe_encoded_feature_names)])
+     maxx = np.array([[1.0] * len(self.ohe_encoded_feature_names)])

    for idx, feature_name in enumerate(self.continuous_feature_names):
-         max_value = self.train_df[feature_name].max()
-         min_value = self.train_df[feature_name].min()
+         max_value = self.data_df[feature_name].max()
+         min_value = self.data_df[feature_name].min()

        if normalized:
            minx[0][idx] = (self.permitted_range[feature_name]
@@ -340,22 +369,35 @@
def get_mads(self, normalized=False):
    """Computes Median Absolute Deviation of features."""
-
    mads = {}
    if normalized is False:
        for feature in self.continuous_feature_names:
            mads[feature] = np.median(
-                 abs(self.train_df[feature].values - np.median(self.train_df[feature].values)))
+                 abs(self.data_df[feature].values - np.median(self.data_df[feature].values)))
    else:
-         normalized_train_df = self.normalize_data(self.train_df)
+         normalized_train_df = self.normalize_data(self.data_df)
        for feature in self.continuous_feature_names:
            mads[feature] = np.median(
                abs(normalized_train_df[feature].values - np.median(normalized_train_df[feature].values)))
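# The Median Absolute Deviation computed above, sketched with numpy on an
# assumed toy column: the median is 31, the absolute deviations are
# [9, 0, 27], and their median is 9.
import numpy as np

values = np.array([22.0, 31.0, 58.0])
print(np.median(abs(values - np.median(values))))  # 9.0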
@@ -369,6 +411,7 @@
            mads[feature] = 1.0
            if display_warnings:
                logging.warning(" MAD for feature %s is 0, so replacing it with 1.0 to avoid error.", feature)
+
    if return_mads:
        return mads
@@ -379,49 +422,87 @@
def create_ohe_params(self):
+     if len(self.categorical_feature_names) > 0:
+         one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
+         self.ohe_encoded_feature_names = [x for x in one_hot_encoded_data.columns.tolist(
+         ) if x not in np.array([self.outcome_name])]
+     else:
+         # one-hot-encoded data is the same as the original data if there are no categorical features.
+         self.ohe_encoded_feature_names = [feat for feat in self.feature_names]
+
+     self.ohe_base_df = self.prepare_df_for_ohe_encoding()  # base dataframe for doing one-hot-encoding
+     # ohe_encoded_feature_names and ohe_base_df are created (and stored as the data class's parameters)
+     # when get_data_params_for_gradient_dice() is called from gradient-based DiCE explainers
+
+
def get_data_params_for_gradient_dice(self):
    """Gets all data-related params for DiCE."""
+     self.create_ohe_params()
    minx, maxx = self.get_minx_maxx(normalized=True)

-     # get the column indexes of categorical features after one-hot-encoding
-     self.encoded_categorical_feature_indexes = self.get_encoded_categorical_feature_indexes()
+     # get the column indexes of categorical and continuous features after one-hot-encoding
+     encoded_categorical_feature_indexes = self.get_encoded_categorical_feature_indexes()
+     flattened_indexes = [item for sublist in encoded_categorical_feature_indexes for item in sublist]
+     encoded_continuous_feature_indexes = [ix for ix in range(len(minx[0])) if ix not in flattened_indexes]
+
+     # min and max for continuous features in the original scale
+     org_minx, org_maxx = self.get_minx_maxx(normalized=False)
+     cont_minx = list(org_minx[0][encoded_continuous_feature_indexes])
+     cont_maxx = list(org_maxx[0][encoded_continuous_feature_indexes])

-     return minx, maxx, self.encoded_categorical_feature_indexes
+     # decimal precisions for continuous features
+     cont_precisions = [self.get_decimal_precisions()[ix] for ix in range(len(self.continuous_feature_names))]
+
+     return minx, maxx, encoded_categorical_feature_indexes, encoded_continuous_feature_indexes, cont_minx, cont_maxx, cont_precisions
def get_encoded_categorical_feature_indexes(self):
    """Gets the column indexes of categorical features after one-hot-encoding."""
    cols = []
    for col_parent in self.categorical_feature_names:
-         temp = [self.encoded_feature_names.index(
-             col) for col in self.encoded_feature_names if col.startswith(col_parent) and
-             col not in self.continuous_feature_names]
+         temp = [self.ohe_encoded_feature_names.index(
+             col) for col in self.ohe_encoded_feature_names if col.startswith(col_parent) and
+             col not in self.continuous_feature_names]
        cols.append(temp)
    return cols
def get_indexes_of_features_to_vary(self, features_to_vary='all'):
    """Gets indexes from feature names of one-hot-encoded data."""
+     # TODO: add encoding as a parameter and use get_indexes_of_features_to_vary for label encoding too
    if features_to_vary == "all":
-         return [i for i in range(len(self.encoded_feature_names))]
+         return [i for i in range(len(self.ohe_encoded_feature_names))]
    else:
        ixs = []
        encoded_cats_ixs = self.get_encoded_categorical_feature_indexes()
        encoded_cats_ixs = [item for sublist in encoded_cats_ixs for item in sublist]
-         for colidx, col in enumerate(self.encoded_feature_names):
+         for colidx, col in enumerate(self.ohe_encoded_feature_names):
            if colidx in encoded_cats_ixs and col.startswith(tuple(features_to_vary)):
                ixs.append(colidx)
            elif colidx not in encoded_cats_ixs and col in features_to_vary:
                ixs.append(colidx)
        return ixs
+
def from_label(self, data):
+     """Transforms label-encoded data back to categorical values."""
+     out = data.copy()
+     if isinstance(data, pd.DataFrame) or isinstance(data, dict):
+         for column in self.categorical_feature_names:
+             out[column] = self.labelencoder[column].inverse_transform(out[column].round().astype(int).tolist())
+         return out
+     elif isinstance(data, list):
+         for c in self.categorical_feature_indexes:
+             out[c] = self.labelencoder[self.feature_names[c]].inverse_transform([round(out[c])])[0]
+         return out
+
def from_dummies(self, data, prefix_sep='_'):
    """Gets the original data from dummy-encoded data with k levels."""
    out = data.copy()
@@ -429,8 +510,10 @@
        # first, derive column names in the one-hot-encoded data from the original data
        cat_col_values = []
        for val in list(self.data_df[feat].unique()):
-             cat_col_values.append(feat + prefix_sep + str(val))  # join the original feature name and its unique values, e.g., education_school
-         match_cols = [c for c in data.columns if c in cat_col_values]  # check for the above matching columns in the encoded data
+             cat_col_values.append(feat + prefix_sep + str(
+                 val))  # join the original feature name and its unique values, e.g., education_school
+         match_cols = [c for c in data.columns if
+                       c in cat_col_values]  # check for the above matching columns in the encoded data

        # then, recreate the original data by removing the suffixes - based on this GitHub issue comment: https://github.com/pandas-dev/pandas/issues/8745#issuecomment-417861271
        cols, labs = [[c.replace(
@@ -440,35 +523,54 @@
def get_decimal_precisions(self, output_type="list"):
    """Gets the precision of continuous features in the data."""
    # if the precision of a continuous feature is not given, we use the maximum precision of the modes to capture the precision of the majority of values in the column.
-     precisions = [0] * len(self.feature_names)
+     precisions_dict = defaultdict(int)
+     precisions = [0] * len(self.feature_names)
    for ix, col in enumerate(self.continuous_feature_names):
-         if (self.continuous_features_precision is not None) and (col in self.continuous_features_precision):
+         if (self.continuous_features_precision is not None) and (col in self.continuous_features_precision):
            precisions[ix] = self.continuous_features_precision[col]
-         elif (self.data_df[col].dtype == np.float32) or (self.data_df[col].dtype == np.float64):
+             precisions_dict[col] = self.continuous_features_precision[col]
+         elif (self.data_df[col].dtype == np.float32) or (self.data_df[col].dtype == np.float64):
            modes = self.data_df[col].mode()
-             maxp = len(str(modes[0]).split('.')[1])  # maxp stores the maximum precision of the modes
+             maxp = len(str(modes[0]).split('.')[1])  # maxp stores the maximum precision of the modes
            for mx in range(len(modes)):
                prec = len(str(modes[mx]).split('.')[1])
                if prec > maxp:
                    maxp = prec
            precisions[ix] = maxp
-     return precisions
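# A sketch of the mode-based precision inference above: the precision of a
# float column is taken as the number of digits after the decimal point in
# its modal value(s).
import pandas as pd

col = pd.Series([0.25, 0.25, 0.5, 0.125])
modes = col.mode()                          # [0.25]
print(len(str(modes[0]).split('.')[1]))     # 2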
def get_decoded_data(self, data, encoding='one-hot'):
+     """Gets the original data from encoded data."""
+     if len(data) == 0:
+         return data
+
+     index = [i for i in range(0, len(data))]
+     if encoding == 'one-hot':
+         if isinstance(data, pd.DataFrame):
+             return self.from_dummies(data)
+         elif isinstance(data, np.ndarray):
+             data = pd.DataFrame(data=data, index=index,
+                                 columns=self.ohe_encoded_feature_names)
+             return self.from_dummies(data)
+         else:
+             raise ValueError("data should be a pandas dataframe or a numpy array")
-
def get_decoded_data(self, data):
-     """Gets the original data from dummy-encoded data."""
-     if isinstance(data, np.ndarray):
-         index = [i for i in range(0, len(data))]
+     elif encoding == 'label':
        data = pd.DataFrame(data=data, index=index,
-                             columns=self.encoded_feature_names)
-         return self.from_dummies(data)
def prepare_df_for_ohe_encoding(self):
+     """Creates the base dataframe for one-hot-encoding a single instance or a set of instances."""
    levels = []
-     colnames = self.categorical_feature_names
+     colnames = [feat for feat in self.categorical_feature_names]
    for cat_feature in colnames:
        levels.append(self.data_df[cat_feature].cat.categories.tolist())
@@ -481,85 +583,67 @@
def get_dev_data(self, model_interface, desired_class, filter_threshold=0.5):
-     """Constructs dev data by extracting the part of the test data for which finding counterfactuals makes sense."""
+     elif isinstance(query_instance, pd.DataFrame):
+         test = query_instance.copy()

-     # create a TensorFlow session if one is not already created
-     if tf.get_default_session() is not None:
-         self.data_sess = tf.get_default_session()
    else:
-         self.data_sess = tf.InteractiveSession()
-
-     # loading trained model
-     model_interface.load_model()
+         raise ValueError("Query instance should be a dict, a pandas dataframe, a list, or a list of dicts")

-     # get the permitted range of change for each feature
-     minx, maxx = self.get_minx_maxx(normalized=True)
-
-     # get the transformed data: continuous features are normalized to fall in the range [0,1], and categorical features are one-hot-encoded
-     data_df_transformed = self.normalize_data(self.one_hot_encoded_data)
-
-     # split data - normalization considers only the train df, so there is no leakage due to transformation before train-test splitting
-     _, test = self.split_data(data_df_transformed)
-     test = test.drop_duplicates(
-         subset=self.encoded_feature_names).reset_index(drop=True)
-
-     # finding target predicted probabilities
-     input_tensor = tf.Variable(minx, dtype=tf.float32)
-     output_tensor = model_interface.get_output(
-         input_tensor)  # model(input_tensor)
-     temp_data = test[self.encoded_feature_names].values.astype(np.float32)
-     dev_preds = [self.data_sess.run(output_tensor, feed_dict={
-         input_tensor: np.array([dt])}) for dt in temp_data]
-     dev_preds = [dev_preds[i][0][0] for i in range(len(dev_preds))]
-
-     # filtering examples which have predicted value >/< threshold
-     dev_data = test[self.encoded_feature_names]
-     if desired_class == 0:
-         idxs = [i for i in range(len(dev_preds))
-                 if dev_preds[i] > filter_threshold]
-     else:
-         idxs = [i for i in range(len(dev_preds))
-                 if dev_preds[i] < filter_threshold]
-     dev_data = dev_data.iloc[idxs]
-     dev_preds = [dev_preds[i] for i in idxs]
-
-     # convert from one-hot-encoded values to user-interpretable format
-     dev_data = self.from_dummies(dev_data)
-     dev_data = self.de_normalize_data(dev_data)
-     return dev_data[self.feature_names], dev_preds  # values.tolist()
+     test = test.reset_index(drop=True)
+     return test
+
+     # TODO: create a new method, get_LE_min_max_normalized_data(), to get label-encoded and normalized data. Keep this method only for converting query_instance to pd.DataFrame
+     # if encoding == 'label':
+     #     for column in self.categorical_feature_names:
+     #         test[column] = self.labelencoder[column].transform(test[column])
+     #     return self.normalize_data(test, encoding)
+     #
+     # elif encoding == 'one-hot':
+     #     temp = self.prepare_df_for_encoding()
+     #     temp = temp.append(test, ignore_index=True, sort=False)
+     #     temp = self.one_hot_encode_data(temp)
+     #     temp = self.normalize_data(temp)
+     #
+     #     return temp.tail(test.shape[0]).reset_index(drop=True)
+
+
def get_ohe_min_max_normalized_data(self, query_instance):
+     """Transforms query_instance into one-hot-encoded and min-max normalized data. query_instance should be a dict, a dataframe, a list, or a list of dicts."""
+     query_instance = self.prepare_query_instance(query_instance)
+     temp = self.ohe_base_df.append(query_instance, ignore_index=True, sort=False)
+     temp = self.one_hot_encode_data(temp)
+     temp = temp.tail(query_instance.shape[0]).reset_index(drop=True)
+     return self.normalize_data(temp)  # returns a pandas dataframe
+
+
def get_inverse_ohe_min_max_normalized_data(self, transformed_data):
+     """Transforms one-hot-encoded and min-max normalized data back into the raw user-fed data format. transformed_data should be a dataframe or an array."""
+     raw_data = self.get_decoded_data(transformed_data, encoding='one-hot')
+     raw_data = self.de_normalize_data(raw_data)
+     precisions = self.get_decimal_precisions()
+     for ix, feature in enumerate(self.continuous_feature_names):
+         raw_data[feature] = raw_data[feature].astype(float).round(precisions[ix])
+     raw_data = raw_data[self.feature_names]
+     return raw_data  # returns a pandas dataframe
@@ -572,11 +656,19 @@
-"""Module pointing to different implementations of DiCE based on different frameworks such as Tensorflow or PyTorch."""
-
-importtensorflowastf
-
+"""Module pointing to different implementations of DiCE based on different frameworks such as Tensorflow or PyTorch or sklearn, and different methods such as RandomSampling, DiCEKD or DiCEGenetic"""
[docs]classDice:"""An interface class to different DiCE implementations."""
- def__init__(self,data_interface,model_interface,**kwargs):
+ def__init__(self,data_interface,model_interface,method="random",**kwargs):"""Init method :param data_interface: an interface to access data related params. :param model_interface: an interface to access the output or gradients of a trained ML model.
+ :param method: Name of the method to use for generating counterfactuals """
- self.decide_implementation_type(data_interface,model_interface,**kwargs)
+ self.decide_implementation_type(data_interface,model_interface,method,**kwargs)
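# A typical construction sketch for the interface class above (assuming the
# dice_ml top-level API with an sklearn backend and an already-trained model;
# train_df and trained_model are illustrative names):
# import dice_ml
# d = dice_ml.Data(dataframe=train_df, continuous_features=['age'],
#                  outcome_name='income')
# m = dice_ml.Model(model=trained_model, backend='sklearn')
# explainer = dice_ml.Dice(d, m, method='kdtree')  # or 'random', 'genetic'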
-
    # To add new implementations of DiCE, add the class in the explainer_interfaces subpackage and import-and-return the class in an elif branch as shown in the method below.
-
def visualize_as_dataframe(self, display_sparse_df=True, show_only_changes=False):
-
    # original instance
    print('Query instance (original outcome : %i)' % round(self.test_pred))
-     display(self.org_instance)  # works only in Jupyter notebook
-
-     if self.posthoc_sparsity_param == None:
-         print('\nCounterfactual set (new outcome : %i)' % (self.new_outcome))
-         self.display_df(self.final_cfs_df, show_only_changes)
-
-     elif 'data_df' in self.data_interface.__dict__ and display_sparse_df == True and self.final_cfs_sparse is not None:
-         # CFs
-         print('\nDiverse Counterfactual set (new outcome : %i)' % (self.new_outcome))
-         self.display_df(self.final_cfs_df_sparse, show_only_changes)
-
-     elif 'data_df' in self.data_interface.__dict__ and display_sparse_df == True and self.final_cfs_sparse is None:
-         print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction... displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
-         self.display_df(self.final_cfs_df, show_only_changes)
-
-     elif 'data_df' not in self.data_interface.__dict__:  # for private data
-         print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome : %i)' % (self.new_outcome))
-         self.display_df(self.final_cfs_df, show_only_changes)
-
+     display(self.test_instance_df)  # works only in Jupyter notebook
+     if self.final_cfs_df is not None and len(self.final_cfs_df) > 0:
+         if self.posthoc_sparsity_param == None:
+             print('\nCounterfactual set (new outcome: {0})'.format(self.new_outcome))
+             self.display_df(self.final_cfs_df, show_only_changes)
+
+         elif hasattr(self.data_interface, 'data_df') and display_sparse_df == True and self.final_cfs_df_sparse is not None:
+             # CFs
+             print('\nDiverse Counterfactual set (new outcome: {0})'.format(self.new_outcome))
+             self.display_df(self.final_cfs_df_sparse, show_only_changes)
+
+         elif hasattr(self.data_interface, 'data_df') and display_sparse_df == True and self.final_cfs_df_sparse is None:
+             print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction... displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
+             self.display_df(self.final_cfs_df, show_only_changes)
+
+         elif not hasattr(self.data_interface, 'data_df'):  # for private data
+             print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome: {0})'.format(self.new_outcome))
+             self.display_df(self.final_cfs_df, show_only_changes)
+
+         else:
+             # CFs
+             print('\nDiverse Counterfactual set without sparsity correction (new outcome: {0})'.format(self.new_outcome))
+             self.display_df(self.final_cfs_df, show_only_changes)
    else:
-         # CFs
-         print('\nDiverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
-         self.display_df(self.final_cfs_df, show_only_changes)
+         print('\nNo counterfactuals found!')
def display_df(self, df, show_only_changes):
    if show_only_changes is False:
-         display(df)  # works only in Jupyter notebook
+         display(df)  # works only in Jupyter notebook
    else:
        newdf = df.values.tolist()
-         org = self.org_instance.values.tolist()[0]
+         org = self.test_instance_df.values.tolist()[0]
        for ix in range(df.shape[0]):
            for jx in range(len(org)):
                if newdf[ix][jx] == org[jx]:
                    newdf[ix][jx] = '-'
                else:
                    newdf[ix][jx] = str(newdf[ix][jx])
-         display(pd.DataFrame(newdf, columns=df.columns))  # works only in Jupyter notebook
+         display(pd.DataFrame(newdf, columns=df.columns))  # works only in Jupyter notebook
def visualize_as_list(self, display_sparse_df=True, show_only_changes=False):
    # original instance
    print('Query instance (original outcome : %i)' % round(self.test_pred))
-     print(self.org_instance.values.tolist()[0])
-
-     if self.posthoc_sparsity_param == None:
-         print('\nCounterfactual set (new outcome : %i)' % (self.new_outcome))
-         self.print_list(self.final_cfs_df, show_only_changes)
-
-     elif 'data_df' in self.data_interface.__dict__ and display_sparse_df == True and self.final_cfs_sparse is not None:
-         # CFs
-         print('\nDiverse Counterfactual set (new outcome : %i)' % (self.new_outcome))
-         self.print_list(self.final_cfs_list_sparse, show_only_changes)
-
-     elif 'data_df' in self.data_interface.__dict__ and display_sparse_df == True and self.final_cfs_sparse is None:
-         print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction... displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
-         self.print_list(self.final_cfs_list_sparse, show_only_changes)
-
-     elif 'data_df' not in self.data_interface.__dict__:  # for private data
-         print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome : %i)' % (self.new_outcome))
-         self.print_list(self.final_cfs_list, show_only_changes)
-
+     print(self.test_instance_df.values.tolist()[0])
+
+     if len(self.final_cfs) > 0:
+         if self.posthoc_sparsity_param == None:
+             print('\nCounterfactual set (new outcome : %i)' % (self.new_outcome))
+             self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
+
+         elif hasattr(self.data_interface, 'data_df') and display_sparse_df == True and self.final_cfs_df_sparse is not None:
+             # CFs
+             print('\nDiverse Counterfactual set (new outcome : %i)' % (self.new_outcome))
+             self.print_list(self.final_cfs_df_sparse.values.tolist(), show_only_changes)
+
+         elif hasattr(self.data_interface, 'data_df') and display_sparse_df == True and self.final_cfs_df_sparse is None:
+             print('\nPlease specify a valid posthoc_sparsity_param to perform sparsity correction... displaying Diverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
+             self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
+
+         elif not hasattr(self.data_interface, 'data_df'):  # for private data
+             print('\nDiverse Counterfactual set without sparsity correction since only metadata about each feature is available (new outcome : %i)' % (self.new_outcome))
+             self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
+
+         else:
+             # CFs
+             print('\nDiverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
+             self.print_list(self.final_cfs_df.values.tolist(), show_only_changes)
    else:
-         # CFs
-         print('\nDiverse Counterfactual set without sparsity correction (new outcome : %i)' % (self.new_outcome))
-         self.print_list(self.final_cfs_list, show_only_changes)
Source code for dice_ml.explainer_interfaces.dice_KD
+"""
+Module to generate counterfactual explanations from a KD-Tree
+This code is similar to 'Interpretable Counterfactual Explanations Guided by Prototypes': https://arxiv.org/pdf/1907.02584.pdf
+"""
+fromdice_ml.explainer_interfaces.explainer_baseimportExplainerBase
+importnumpyasnp
+importtimeit
+importpandasaspd
+importcopy
+importrandom
+
+fromdice_mlimportdiverse_counterfactualsasexp
+
+
+
[docs]classDiceKD(ExplainerBase):
+
+ def__init__(self,data_interface,model_interface):
+ """Init method
+
+ :param data_interface: an interface class to access data related params.
+ :param model_interface: an interface class to access trained ML model.
+
+ """
+ self.total_random_inits=0
+ super().__init__(data_interface)# initiating data related parameters
+
+ # As DiCE KD uses one-hot-encoding
+ self.data_interface.create_ohe_params()
+
+ # initializing model variables
+ self.model=model_interface
+ self.model.load_model()# loading pickled trained model if applicable
+ self.model.transformer.feed_data_params(data_interface)
+ self.model.transformer.initialize_transform_func()
+
+ # loading trained model
+ self.model.load_model()
+
+ # number of output nodes of ML model
+ ifself.model.model_type=='classifier':
+ self.num_output_nodes=self.model.get_num_output_nodes2(
+ self.data_interface.data_df[0:1][self.data_interface.feature_names])
+
+ self.predicted_outcome_name=self.data_interface.outcome_name+'_pred'
+
+    def _generate_counterfactuals(self, query_instance, total_CFs, desired_range=None, desired_class="opposite",
+                                  features_to_vary="all",
+                                  permitted_range=None, sparsity_weight=1,
+                                  feature_weights="inverse_mad", stopping_threshold=0.5, posthoc_sparsity_param=0.1,
+                                  posthoc_sparsity_algorithm="linear", verbose=False):
+        """Generates diverse counterfactual explanations.
+
+        :param query_instance: A dictionary of feature names and values. Test point of interest.
+        :param total_CFs: Total number of counterfactuals required.
+        :param desired_range: For regression problems. Contains the outcome range to generate counterfactuals in.
+        :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the outcome class of query_instance for binary classification.
+        :param features_to_vary: Either a string "all" or a list of feature names to vary.
+        :param permitted_range: Dictionary with continuous feature names as keys and permitted min-max range in list as values. Defaults to the range inferred from training data. If None, uses the parameters initialized in data_interface.
+        :param sparsity_weight: Parameter to determine how much importance to give to sparsity.
+        :param feature_weights: Either "inverse_mad" or a dictionary with feature names as keys and corresponding weights as values. The default option is "inverse_mad", where the weight for a continuous feature is the inverse of the Median Absolute Deviation (MAD) of the feature's values in the training set; the weight for a categorical feature is 1 by default.
+        :param stopping_threshold: Minimum threshold for the counterfactuals' target class probability.
+        :param posthoc_sparsity_param: Parameter for the post-hoc operation on continuous features to enhance sparsity.
+        :param posthoc_sparsity_algorithm: Perform either linear or binary search. Takes "linear" or "binary". Prefer binary search when a feature range is large (for instance, income varying from 10k to 1000k) and only if the features share a monotonic relationship with the predicted outcome in the model.
+        :param verbose: Parameter to determine whether to print 'Diverse Counterfactuals found!'
+
+        :return: A CounterfactualExamples object to store and visualize the resulting counterfactual explanations (see diverse_counterfactuals.py).
+        """
+        data_df_copy = self.data_interface.data_df.copy()
+
+        features_to_vary = self.setup(features_to_vary, permitted_range, query_instance, feature_weights)
+
+        # Prepares the user-defined query_instance for DiCE.
+        query_instance_orig = query_instance.copy()
+        query_instance = self.data_interface.prepare_query_instance(query_instance=query_instance)
+
+        # find the predicted value of query_instance
+        test_pred = self.predict_fn(query_instance)[0]
+
+        query_instance[self.data_interface.outcome_name] = test_pred
+
+        if desired_range != None:
+            if desired_range[0] > desired_range[1]:
+                raise ValueError("Invalid Range!")
+
+        if desired_class == "opposite" and self.model.model_type == 'classifier':
+            if self.num_output_nodes == 2:
+                desired_class = 1.0 - test_pred
+
+            elif self.num_output_nodes > 2:
+                raise ValueError("Desired class can't be opposite if the number of classes is more than 2.")
+
+        if isinstance(desired_class, int) and desired_class > self.num_output_nodes - 1:
+            raise ValueError("Desired class should be within 0 and num_classes-1.")
+
+        # Partitioned dataset and KD-tree for each class (binary) of the dataset
+        self.dataset_with_predictions, self.KD_tree, self.predictions = self.build_KD_tree(data_df_copy, desired_range,
+                                                                                           desired_class, self.predicted_outcome_name)
+
+        query_instance, final_cfs, cfs_preds = self.find_counterfactuals(data_df_copy,
+                                                                         query_instance, query_instance_orig,
+                                                                         desired_range,
+                                                                         desired_class,
+                                                                         total_CFs, features_to_vary,
+                                                                         permitted_range,
+                                                                         sparsity_weight,
+                                                                         stopping_threshold,
+                                                                         posthoc_sparsity_param,
+                                                                         posthoc_sparsity_algorithm, verbose)
+
+        return exp.CounterfactualExamples(data_interface=self.data_interface,
+                                          final_cfs_df=final_cfs,
+                                          test_instance_df=query_instance,
+                                          final_cfs_df_sparse=self.final_cfs_sparse,
+                                          posthoc_sparsity_param=posthoc_sparsity_param,
+                                          desired_range=desired_range,
+                                          desired_class=desired_class,
+                                          model_type=self.model.model_type)
+
+
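# A call sketch for the method above (an already-constructed explainer is
# assumed; the public generate_counterfactuals() wrapper in the base
# explainer ultimately dispatches to _generate_counterfactuals):
# cf = explainer.generate_counterfactuals(
#     {'age': 22, 'education': 'school'}, total_CFs=4, desired_class='opposite')
# cf.visualize_as_dataframe(show_only_changes=True)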
    def find_counterfactuals(self, data_df_copy, query_instance, query_instance_orig, desired_range, desired_class,
+                             total_CFs, features_to_vary, permitted_range,
+                             sparsity_weight, stopping_threshold, posthoc_sparsity_param, posthoc_sparsity_algorithm,
+                             verbose):
+        """Finds counterfactuals by querying a KD-tree for the nearest data points in the desired class from the dataset."""
+
+        self.stopping_threshold = stopping_threshold
+        if self.model.model_type == 'classifier':
+            self.target_cf_class = np.array([[desired_class]], dtype=np.float32)
+        elif self.model.model_type == 'regressor':
+            self.target_cf_range = desired_range
+
+        if self.model.model_type == 'classifier':
+            if self.target_cf_class == 0 and self.stopping_threshold > 0.5:
+                self.stopping_threshold = 0.25
+            elif self.target_cf_class == 1 and self.stopping_threshold < 0.5:
+                self.stopping_threshold = 0.75
+
+        start_time = timeit.default_timer()
+
+        # Making the one-hot-encoded version of the query instance match the one-hot-encoded version of the dataset
+        query_instance_df_dummies = pd.get_dummies(query_instance_orig)
+        for col in pd.get_dummies(data_df_copy[self.data_interface.feature_names]).columns:
+            if col not in query_instance_df_dummies.columns:
+                query_instance_df_dummies[col] = 0
+
+        final_cfs, cfs_preds = self.vary_valid(query_instance_df_dummies,
+                                               total_CFs,
+                                               features_to_vary,
+                                               permitted_range,
+                                               query_instance_orig,
+                                               sparsity_weight)
+
+        total_cfs_found = len(final_cfs)
+        if total_cfs_found > 0:
+            # post-hoc operation on continuous features to enhance sparsity - only for public data
+            if posthoc_sparsity_param != None and posthoc_sparsity_param > 0 and 'data_df' in self.data_interface.__dict__:
+                final_cfs_sparse = copy.deepcopy(final_cfs)
+                self.final_cfs_sparse = self.do_posthoc_sparsity_enhancement(final_cfs_sparse, query_instance,
+                                                                             posthoc_sparsity_param,
+                                                                             posthoc_sparsity_algorithm)
+            else:
+                self.final_cfs_sparse = None
+        else:
+            self.final_cfs_sparse = None
+
+        if total_cfs_found > 0:
+            # to display the values with the same precision as the original data
+            precisions = self.data_interface.get_decimal_precisions()
+            for ix, feature in enumerate(self.data_interface.continuous_feature_names):
+                final_cfs[feature] = final_cfs[feature].astype(float).round(precisions[ix])
+                if self.final_cfs_sparse is not None:  # guard: sparse CFs are None when sparsity correction is skipped
+                    self.final_cfs_sparse[feature] = self.final_cfs_sparse[feature].astype(float).round(precisions[ix])
+
+        self.elapsed = timeit.default_timer() - start_time
+
+        m, s = divmod(self.elapsed, 60)
+
+        if verbose:
+            if total_cfs_found < total_CFs:
+                self.elapsed = timeit.default_timer() - start_time
+                m, s = divmod(self.elapsed, 60)
+                print(
+                    'Only %d (required %d) Diverse Counterfactuals found for the given configuration, perhaps change the query instance or the features to vary...' % (
+                        total_cfs_found, total_CFs), '; total time taken: %02d' % m, 'min %02d' % s, 'sec')
+            else:
+                print('Diverse Counterfactuals found! Total time taken: %02d' % m, 'min %02d' % s, 'sec')
+
+        return query_instance, final_cfs, cfs_preds
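# A sketch of the dummy-column alignment performed above: a single query row
# only yields indicator columns for the levels it actually contains, so the
# missing level columns are added with value 0 to match the dataset's
# one-hot layout (toy frames assumed).
import pandas as pd

data_df = pd.DataFrame({'education': ['school', 'college']})
query_dummies = pd.get_dummies(pd.DataFrame({'education': ['school']}))
for col in pd.get_dummies(data_df).columns:
    if col not in query_dummies.columns:
        query_dummies[col] = 0
print(query_dummies.columns.tolist())  # ['education_school', 'education_college']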