-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathmain.py
448 lines (299 loc) · 20.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
import logging
from data_ingestion.data_loader import DataLoad
from data_preprocessing.data_preprocessing import DataPreprocessor
from algorithms.linear_models import LinearRegressionWithFeatureSelection, Lasso
from algorithms.tree_models import TreeModelsReg
from evaluation.evaluation import Metrics
import pandas as pd
import warnings
import joblib
# Silence every warning raised by the libraries and limit pandas console
# output to at most 10 columns.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 10)

# Source of the dataset.
# Fix: the previous literal r'dataset\concrete_data.csv ' ended with a stray
# space and used a Windows-only backslash separator, so the file could not be
# located on POSIX systems. A forward slash is accepted on Windows and POSIX.
data = 'dataset/concrete_data.csv'

# Application logging: records of level INFO and above are written to
# 'development_logs.log' in the form "LEVEL:timestamp:message".
logging.basicConfig(filename='development_logs.log', level=logging.INFO,
                    format='%(levelname)s:%(asctime)s:%(message)s')
logging.info("Hello all, welcome to the development process :-). This entire development process runs in the main.py ")

# Loading the dataset
# DataLoad is responsible for getting the data out of the dataset source;
# per the original notes, fetch_data() returns it as a pandas dataframe.
d = DataLoad(data)
df = d.fetch_data()

# ---- Data preprocessing ----
# Per the EDA, the 'age' column has the largest number of outliers, so they
# are removed before anything else happens.
dp = DataPreprocessor(df)
df = dp.rem_outliers('age')

# ---- Train / test split (test_size=0.30) ----
df_train, df_test = dp.data_split(test_size=0.30)

# ---- Feature scaling for linear regression ----
# (StandardScaler, according to the original notes.)
df_train_sc, df_test_sc = dp.feature_scaling(df_train, df_test)

# ---- Independent features (X) and dependent target (y), scaled data ----
X_train, y_train, X_test, y_test = dp.splitting_as_x_y(
    df_train_sc,
    df_test_sc,
    'concrete_compressive_strength',
)

# Experimenting with Linear regression model, feature selection using the
# Backward elimination approach.
# Object responsible for building the linear regression models.
lr = LinearRegressionWithFeatureSelection(X_train, y_train, X_test, y_test)

# 1) Linear regression with the Backward elimination approach:
# the fitted model, the reduced train/test feature sets, the predictions on
# both of them, and the features kept by backward elimination.
(
    lr_be,
    X_train_be,
    y_pred_train_be,
    X_test_be,
    y_pred_test_be,
    relevant_features_by_BE,
) = lr.backward_elimination_approach()

logging.info(
    "Defining a function to record the relevant features of each model in the 'relevant_features_by_models.csv' file"
    "in the 'results' directory")

# Empty dataframe that collects the relevant features per algorithm.
imp_f = pd.DataFrame(columns=['Algorithm', 'Imp_Features'])
def rec_imp_features(dataframe, algorithm_name, imp_features):
    """Append an algorithm's important features to ``dataframe`` and save it as CSV.

    parameters
    ----------------------------------------
    dataframe: A Pandas dataframe in which the results are recorded. It is
        modified in place: one row ``[algorithm_name, imp_features]`` is
        appended at index ``len(dataframe)``.
    algorithm_name: Algorithm we used
    imp_features: Important features chosen by the algorithm

    returns
    ----------------------------------------
    None. The dataframe is written to
    'results/relevant_features_by_models.csv' (without the index) and
    ``DataFrame.to_csv`` returns None when it is given a path. None is also
    returned when an error occurred; the error is logged, never raised.
    """
    import os  # local import so the block does not depend on file-level imports

    try:
        dataframe.loc[len(dataframe)] = [algorithm_name, imp_features]
        # Without the target directory to_csv raises, and because errors are
        # swallowed below the results would silently never reach the disk.
        os.makedirs('results', exist_ok=True)
        saved = dataframe.to_csv('results/relevant_features_by_models.csv', index=False)
    except Exception as e:
        # Deliberately best-effort: a failed bookkeeping step must not abort
        # the experiments. exc_info keeps the traceback in the log file.
        logging.error("Exception occurred in the function 'rec_imp_features'. "
                      "Exception message: " + str(e), exc_info=True)
        return None

    # Report success only after the file has actually been written.
    logging.info(f"Important features of the {algorithm_name} updated in the 'relevant_features_by_models.csv' ")
    return saved
# Record the features kept by the backward elimination approach.
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Linear Regression_BE",
    imp_features=relevant_features_by_BE,
)

# Metrics object used to score the models.
metrics = Metrics()

# ---- Storing the results of model performance ----
# One row per model. Resulting columns: 'Algorithm', then
# 'Train_R2 score', 'Train_Adj_R2 score', 'Train_RMSE score',
# 'Test_R2 score', 'Test_Adj_R2 score', 'Test_RMSE score'.
algo_results = pd.DataFrame(
    columns=['Algorithm'] + [f'{split}_{score} score'
                             for split in ('Train', 'Test')
                             for score in ('R2', 'Adj_R2', 'RMSE')])
logging.info('Created a blank data frame "algo_results" to store the results of each and every model')
logging.info('Defining a function "evaluate" to evaluate the performance metrics of a model')
def evaluate(dataframe, algorithm_name,
             x_train, y_train, y_pred_train,
             x_test, y_test, y_pred_test):
    """Append a model's train/test scores to ``dataframe`` and save it as CSV.

    One row is appended in place at index ``len(dataframe)``: the algorithm
    name followed by the R2, adjusted R2 and RMSE scores on the training data
    and then the same three scores on the testing data. The scores are
    computed by the module-level ``metrics`` object.

    parameters
    ------------------------------
    dataframe: A pandas dataframe to store the results of experiments on various algorithms
    algorithm_name: Name of an algorithm w.r.t. its results
    x_train: Training data with independent features
    y_train: Training data with dependent feature i.e., actual values
    y_pred_train: Predictions on the training data
    x_test: Testing data with independent features
    y_test: Testing data with dependent feature i.e., actual values
    y_pred_test: Predictions on the testing data

    returns
    -----------------------------
    None. The dataframe is written to 'results/Performance of algorithms.csv'
    (without the index) and ``DataFrame.to_csv`` returns None when it is
    given a path. None is also returned when an error occurred; the error is
    logged, never raised.
    """
    import os  # local import so the block does not depend on file-level imports

    try:
        train_scores = [metrics.r2_score(y_train, y_pred_train),
                        metrics.adj_r2_score(x_train, y_train, y_pred_train),
                        metrics.rmse_score(y_train, y_pred_train)]
        test_scores = [metrics.r2_score(y_test, y_pred_test),
                       metrics.adj_r2_score(x_test, y_test, y_pred_test),
                       metrics.rmse_score(y_test, y_pred_test)]
        dataframe.loc[len(dataframe)] = [algorithm_name, *train_scores, *test_scores]
        # Without the target directory to_csv raises, and because errors are
        # swallowed below the results would silently never reach the disk.
        os.makedirs('results', exist_ok=True)
        saved = dataframe.to_csv('results/Performance of algorithms.csv', index=False)
    except Exception as e:
        # Deliberately best-effort: a failed bookkeeping step must not abort
        # the experiments. exc_info keeps the traceback in the log file.
        logging.error("Exception occurred in the function 'evaluate'. "
                      "Exception message: " + str(e), exc_info=True)
        return None

    # Report success only after the file has actually been written.
    logging.info('Results Dataframe saved to disk as "Performance of algorithms.csv" file in the "results" directory. ')
    logging.info("Updated the results in 'Performance of algorithms.csv'")
    return saved
# Store the scores of the backward-elimination model on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Linear Regression_BE',
    x_train=X_train_be, y_train=y_train, y_pred_train=y_pred_train_be,
    x_test=X_test_be, y_test=y_test, y_pred_test=y_pred_test_be,
)

# 2) Linear regression with the RFE approach:
# the fitted model, the reduced train/test feature sets, the predictions on
# both of them, and the features selected by RFE.
(
    lr_rfe,
    X_train_rfe,
    y_pred_train_rfe,
    X_test_rfe,
    y_pred_test_rfe,
    relevant_features_by_RFE,
) = lr.rfe_approach()

# Record the relevant features, then the model's scores.
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Linear Regression_RFE",
    imp_features=relevant_features_by_RFE,
)
evaluate(
    dataframe=algo_results,
    algorithm_name='Linear Regression_RFE',
    x_train=X_train_rfe, y_train=y_train, y_pred_train=y_pred_train_rfe,
    x_test=X_test_rfe, y_test=y_test, y_pred_test=y_pred_test_rfe,
)

# 3) Linear regression with LassoCV approach
# Object responsible for building linear regression models with Lasso
# regularization.
ls = Lasso(X_train, y_train, X_test, y_test)

# The fitted model, the train/test feature sets and the predictions on both.
(
    lr_lasso,
    X_train_lasso,
    y_pred_train_lasso,
    X_test_lasso,
    y_pred_test_lasso,
) = ls.lassocv()

# Relevant features for the Lasso model (hard-coded; only recorded, never
# used to index the data).
relevant_features_by_Lasso = [
    'cement', 'age', 'blast_furnace_slag', 'fly_ash', 'superplasticizer',
    'fine_aggregate', 'water',
]
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Linear Regression_Lasso",
    imp_features=relevant_features_by_Lasso,
)

# Store the scores of the Lasso model on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Linear Regression_Lasso',
    x_train=X_train_lasso, y_train=y_train, y_pred_train=y_pred_train_lasso,
    x_test=X_test_lasso, y_test=y_test, y_pred_test=y_pred_test_lasso,
)

# Author's findings: per the recorded results, all 3 linear regression
# approaches deliver similar results. Linear Regression_RFE has a very low
# RMSE score on the test set, so it can be considered the best of the three.
# After the residual analysis with this model in the jupyter notebook
# "EDA + Model building.ipynb" it is evident that it satisfies all the linear
# regression assumptions.
#
# However, all 3 variants of linear regression show a slight under-fit, so
# different tree models are tried next.

# Splitting datasets into independent and dependent datasets respectively
# Tree models do not require feature scaling, hence the original (unscaled)
# train/test data replaces the scaled X/y sets from here on.
X_train, y_train, X_test, y_test = dp.splitting_as_x_y(
    df_train,
    df_test,
    'concrete_compressive_strength',
)

# Object responsible for building the tree based models (all features).
t = TreeModelsReg(X_train, y_train, X_test, y_test)

# 1) Decision Tree Regressor
logging.info(
    "Building a Decision tree regressor model on the training data and the importance of each feature will be displayed "
    "in the console.")
dt_model = t.decision_tree_regressor()

# Top 4 features as per the feature importance (hard-coded).
top_features_dt = ['cement', 'age', 'water', 'blast_furnace_slag']
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Decision tree regressor",
    imp_features=top_features_dt,
)

# Rebuild the model on the relevant features only.
X_train_dt, X_test_dt = X_train[top_features_dt], X_test[top_features_dt]
t = TreeModelsReg(X_train_dt, y_train, X_test_dt, y_test)
logging.info("Building a Decision tree regressor model on the training data with the relevant features only")
dt_model_2 = t.decision_tree_regressor()

# Predictions on the training and the testing data.
y_pred_train_dt = t.model_predict(dt_model_2, X_train_dt)
y_pred_test_dt = t.model_predict(dt_model_2, X_test_dt)
logging.info(
    'Using Decision tree regressor model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the decision tree on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Decision tree regressor',
    x_train=X_train_dt, y_train=y_train, y_pred_train=y_pred_train_dt,
    x_test=X_test_dt, y_test=y_test, y_pred_test=y_pred_test_dt,
)

# Result (author's finding): the model over-fits.
# Next: post pruning to reduce over-fitting in the decision tree.
# Decision tree with post pruning, still restricted to the relevant features
# (reuses the `t` object and the X_*_dt sets built for the plain tree).
logging.info(
    "Building a Decision tree regressor model with post pruning technique. Considering only the relevant features.")
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Decision tree regressor_post pruning",
    imp_features=top_features_dt,
)
dt_model_pp = t.decision_tree_regressor_post_pruning()

# Predictions on the training and the testing data.
y_pred_train_dt_pp = t.model_predict(dt_model_pp, X_train_dt)
y_pred_test_dt_pp = t.model_predict(dt_model_pp, X_test_dt)
logging.info(
    'Using Decision tree regressor_post pruning model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the pruned tree on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Decision tree regressor_post pruning',
    x_train=X_train_dt, y_train=y_train, y_pred_train=y_pred_train_dt_pp,
    x_test=X_test_dt, y_test=y_test, y_pred_test=y_pred_test_dt_pp,
)

# Result (author's finding): the over-fitting issue is resolved and the model
# is better than the previous one.
# Next: some ensemble techniques, to get even better R2 scores.

# 2) Random forest regressor
# Object responsible for building the tree based models (all features).
t = TreeModelsReg(X_train, y_train, X_test, y_test)
logging.info(
    "Building a Random forest regressor model on the training data and the importance of each feature will be displayed "
    "in the console.")
rf_model = t.random_forest_regressor()

# Top 5 features as per the feature importance (hard-coded).
top_features_rf = ['age', 'cement', 'water', 'blast_furnace_slag', 'superplasticizer']
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Random Forest regressor",
    imp_features=top_features_rf,
)

# Rebuild the model on the relevant features only.
X_train_rf, X_test_rf = X_train[top_features_rf], X_test[top_features_rf]
t = TreeModelsReg(X_train_rf, y_train, X_test_rf, y_test)
logging.info(
    "Building a Random forest regressor model on the training data with the relevant features only.")
rf_model_2 = t.random_forest_regressor()

# Predictions on the training and the testing data.
y_pred_train_rf = t.model_predict(rf_model_2, X_train_rf)
y_pred_test_rf = t.model_predict(rf_model_2, X_test_rf)
logging.info(
    'Using Random Forest Regressor model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the random forest on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Random Forest regressor',
    x_train=X_train_rf, y_train=y_train, y_pred_train=y_pred_train_rf,
    x_test=X_test_rf, y_test=y_test, y_pred_test=y_pred_test_rf,
)

# Result (author's finding): the random forest regressor looks far better
# than the decision tree regressor.
# Next: boosting algorithms, to see whether the scores improve further.

# 3) AdaBoost Regressor
# Object responsible for building the tree based models (all features).
t = TreeModelsReg(X_train, y_train, X_test, y_test)
logging.info(
    "Building an Adaboost regressor model on the training data and the importance of each feature will be displayed "
    "in the console.")
adb_model = t.adaboost_regressor()

# Top features by the Adaboost regressor (hard-coded).
# NOTE(review): 'fine_aggregate ' carries a trailing space and is used to
# index the columns below; presumably it mirrors the raw CSV header.
# Confirm against the dataset before normalizing it.
top_features_adb = [
    'cement', 'age', 'water', 'blast_furnace_slag', 'superplasticizer',
    'fine_aggregate ', 'coarse_aggregate',
]
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Adaboost regressor",
    imp_features=top_features_adb,
)

# Rebuild the model on the relevant features only.
X_train_adb, X_test_adb = X_train[top_features_adb], X_test[top_features_adb]
t = TreeModelsReg(X_train_adb, y_train, X_test_adb, y_test)
logging.info(
    "Building an Adaboost regressor model on the training data with the relevant features only.")
adb_model_2 = t.adaboost_regressor()

# Predictions on the training and the testing data.
y_pred_train_adb = t.model_predict(adb_model_2, X_train_adb)
y_pred_test_adb = t.model_predict(adb_model_2, X_test_adb)
logging.info(
    'Using Adaboost Regressor model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the Adaboost model on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Adaboost regressor',
    x_train=X_train_adb, y_train=y_train, y_pred_train=y_pred_train_adb,
    x_test=X_test_adb, y_test=y_test, y_pred_test=y_pred_test_adb,
)

# Result (author's finding): the accuracy is lower than that of the random
# forest regressor.
# Next: Gradient boosting regressor.

# 4) Gradient Boosting Regressor
# Object responsible for building the tree based models (all features).
t = TreeModelsReg(X_train, y_train, X_test, y_test)
logging.info(
    "Building a Gradient Boosting regressor model on the training data and the importance of each feature will be displayed "
    "in the console.")
gbr_model = t.gradientboosting_regressor()

# Top 4 features by the Gradient boosting regressor (hard-coded).
top_features_gbr = ['age', 'cement', 'water', 'blast_furnace_slag']
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="Gradient Boost regressor",
    imp_features=top_features_gbr,
)

# Rebuild the model on the relevant features only.
X_train_gbr, X_test_gbr = X_train[top_features_gbr], X_test[top_features_gbr]
t = TreeModelsReg(X_train_gbr, y_train, X_test_gbr, y_test)
logging.info("Building a Gradient Boosting regressor model on the training data with the relevant features only")
gbr_model_2 = t.gradientboosting_regressor()

# Predictions on the training and the testing data.
y_pred_train_gbr = t.model_predict(gbr_model_2, X_train_gbr)
y_pred_test_gbr = t.model_predict(gbr_model_2, X_test_gbr)
logging.info(
    'Using Gradient Boosting Regressor model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the gradient boosting model on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='Gradient Boost regressor',
    x_train=X_train_gbr, y_train=y_train, y_pred_train=y_pred_train_gbr,
    x_test=X_test_gbr, y_test=y_test, y_pred_test=y_pred_test_gbr,
)

# Result (author's finding): compared to the Adaboost regressor, the Gradient
# Boost regressor has performed well.
# Next: XGBoost regressor.

# 5) XGBoost Regressor
# Object responsible for building the tree based models (all features).
t = TreeModelsReg(X_train, y_train, X_test, y_test)
logging.info(
    "Building an XGBoost regressor model on the training data and the importance of each feature will be displayed "
    "in the console.")
xgbr_model = t.xgb_regressor()

# Top features by the XGBoost regressor (hard-coded).
top_features_xgbr = [
    'age', 'cement', 'water', 'fly_ash', 'superplasticizer', 'blast_furnace_slag',
]
rec_imp_features(
    dataframe=imp_f,
    algorithm_name="XGBoost regressor",
    imp_features=top_features_xgbr,
)

# Rebuild the model on the relevant features only.
X_train_xgbr, X_test_xgbr = X_train[top_features_xgbr], X_test[top_features_xgbr]
t = TreeModelsReg(X_train_xgbr, y_train, X_test_xgbr, y_test)
logging.info("Building an XGBoost regressor model on the training data with the relevant features only. ")
xgbr_model_2 = t.xgb_regressor()

# Predictions on the training and the testing data.
y_pred_train_xgbr = t.model_predict(xgbr_model_2, X_train_xgbr)
y_pred_test_xgbr = t.model_predict(xgbr_model_2, X_test_xgbr)
logging.info(
    'Using XGBoost regressor model, successfully made predictions on both the training and testing data respectively')

# Store the scores of the XGBoost model on disk.
evaluate(
    dataframe=algo_results,
    algorithm_name='XGBoost regressor',
    x_train=X_train_xgbr, y_train=y_train, y_pred_train=y_pred_train_xgbr,
    x_test=X_test_xgbr, y_test=y_test, y_pred_test=y_pred_test_xgbr,
)

# Result (author's finding): per the recorded results, the XGBoost regressor
# is the best one in terms of adjusted R2 score on the testing data, followed
# by the Random Forest regressor.
logging.info("Best Model: XGBoost regressor ")

# Saving the best models into the "models" directory.
# Fix: the previous literals 'models\X...' and 'models\R...' relied on the
# invalid escape sequences \X and \R (a warning today, an error in a future
# Python) and on a Windows-only separator. Forward slashes match the
# 'results/...' paths used elsewhere in this file and are accepted on both
# Windows and POSIX.
logging.info('Saving the XGBoost regressor model into the "models" directory')
joblib.dump(xgbr_model_2, 'models/XGBoost_Regressor_model.pkl')
logging.info('Saving the Random Forest regressor model into the "models" directory')
joblib.dump(rf_model_2, 'models/RandomForest_Regressor_model.pkl')

# The backslash in the "_/\_" emoticon is now escaped explicitly; the logged
# text is unchanged.
logging.info(" Solution development part completed successfully. Thank you _/\\_ , Stay safe and healthy :-) ")
# With this, the development part is completed. Moving on to the deployment
# part in the app.py file.