INRIA · ArturoAmorQ · Jun 14, 2023 · Feb 9, 2023 · Feb 13, 2023 · Feb 23, 2023
diff --git a/python_scripts/02_numerical_pipeline_scaling.py b/python_scripts/02_numerical_pipeline_scaling.py
@@ -171,8 +171,14 @@
 data_train_scaled = scaler.fit_transform(data_train)
 data_train_scaled
 
+# %% [markdown]
+# By default `StandardScaler` outputs a NumPy array, but it is also possible to
+# set the output to be a pandas dataframe. This makes some data exploration
+# tasks easier, as it preserves the column names.
+
 # %%
-data_train_scaled = pd.DataFrame(data_train_scaled, columns=data_train.columns)
+scaler = StandardScaler().set_output(transform="pandas")
+data_train_scaled = scaler.fit_transform(data_train)
 data_train_scaled.describe()
 
 # %% [markdown]

diff --git a/python_scripts/03_categorical_pipeline.py b/python_scripts/03_categorical_pipeline.py
@@ -103,7 +103,7 @@
 
 education_column = data_categorical[["education"]]
 
-encoder = OrdinalEncoder()
+encoder = OrdinalEncoder().set_output(transform="pandas")
 education_encoded = encoder.fit_transform(education_column)
 education_encoded
 
@@ -168,7 +168,7 @@
 # %%
 from sklearn.preprocessing import OneHotEncoder
 
-encoder = OneHotEncoder(sparse_output=False)
+encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
 education_encoded = encoder.fit_transform(education_column)
 education_encoded
 
@@ -184,17 +184,8 @@
 # ```
 
 # %% [markdown]
-# We see that encoding a single feature will give a NumPy array full of zeros
-# and ones. We can get a better understanding using the associated feature names
-# resulting from the transformation.
-
-# %%
-feature_names = encoder.get_feature_names_out(input_features=["education"])
-education_encoded = pd.DataFrame(education_encoded, columns=feature_names)
-education_encoded
-
-# %% [markdown]
-# As we can see, each category (unique value) became a column; the encoding
+# We see that encoding a single feature will give a dataframe full of zeros
+# and ones. Each category (unique value) became a column; the encoding
 # returned, for each sample, a 1 to specify which category it belongs to.
 #
 # Let's apply this encoding on the full dataset.
@@ -210,14 +201,6 @@
 # %%
 print(f"The encoded dataset contains {data_encoded.shape[1]} features")
 
-# %% [markdown]
-# Let's wrap this NumPy array in a dataframe with informative column names as
-# provided by the encoder object:
-
-# %%
-columns_encoded = encoder.get_feature_names_out(data_categorical.columns)
-pd.DataFrame(data_encoded, columns=columns_encoded).head()
-
 # %% [markdown]
 # Look at how the `"workclass"` variable of the 3 first records has been encoded
 # and compare this to the original string representation.

diff --git a/python_scripts/ensemble_adaboost.py b/python_scripts/ensemble_adaboost.py
@@ -212,6 +212,13 @@
         data=penguins,
         palette=palette,
     )
+    sns.scatterplot(
+        x=culmen_columns[0],
+        y=culmen_columns[1],
+        hue=target_column,
+        data=penguins,
+        palette=palette,
+    )
     plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left")
     _ = plt.title(f"Decision tree trained at round {boosting_round}")