# myapp.py
import streamlit as st
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from matplotlib import cm
import seaborn as sns
import pickle
import time
from io import BytesIO
import requests
import os
from math import log, exp, sqrt
from pickle import load
import math
import dash_daq as daq
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
#import tensorflow library
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
image = Image.open('Turbofan.jpg')
# st.set_page_config must be the first Streamlit command in the script
st.set_page_config(
page_title="RUL",
page_icon=":shark:",
layout="centered",
initial_sidebar_state="auto")
st.set_option('deprecation.showPyplotGlobalUse', False)
Data_url_train = 'https://raw.githubusercontent.com/Ali-Alhamaly/Turbofan_usefull_life_prediction/master/Data/Simulation_Data/train_FD002.txt'
Data_url_test = 'https://raw.githubusercontent.com/Ali-Alhamaly/Turbofan_usefull_life_prediction/master/Data/Simulation_Data/test_FD002.txt'
@st.cache
def load_data(URL):
index_columns_names = ["Engine_No","Time_in_cycles"]
operational_settings_columns_names = ['Altitude','Mach_number','TRA']
sensor_measure_columns_names = ['T2','T24','T30','T50', 'P2','P15','P30', 'Nf','Nc','epr','Ps30','phi','NRf','NRc','BPR','fuel_air_ratio','htBleed','Nf_dmd','PCNfR_dmd','W31','W32']
input_file_column_names = index_columns_names + operational_settings_columns_names + sensor_measure_columns_names
df= pd.read_csv(URL,delim_whitespace=True,names=input_file_column_names,error_bad_lines=False)
# convert altitude from kilofeet to meters
df['Altitude'] = df['Altitude'] * 1000 * 0.3048
# convert temperatures from Rankine to Kelvin
for col in ['T2', 'T24', 'T30', 'T50']:
df[col] = df[col] * 0.555556
# convert pressures from psia to bar
for col in ['P2', 'P15', 'P30', 'Ps30']:
df[col] = df[col] * 0.0689476
return df
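# Illustrative sanity check of the conversion factors above (not executed by the app):
# 10 kft * 1000 * 0.3048 = 3048 m; 518.67 R * 0.555556 ~= 288.15 K;
# 14.696 psia * 0.0689476 ~= 1.013 bar (standard sea-level conditions).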
@st.cache
def RUL(df):
rul = pd.DataFrame(df.groupby('Engine_No')['Time_in_cycles'].max()).reset_index() # Data Labeling - generate column RUL
rul.columns = ['Engine_No', 'max_No_cycles']
df= df.merge(rul, on=['Engine_No'], how='left')
df['RUL'] = df['max_No_cycles'] - df['Time_in_cycles']
df.drop('max_No_cycles', inplace=True, axis=1)
return df
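# Illustrative sketch of RUL() (not executed by the app): for a toy engine
# logged for 5 cycles, max_No_cycles = 5, so RUL counts down to zero:
# toy = pd.DataFrame({'Engine_No': [1]*5, 'Time_in_cycles': [1, 2, 3, 4, 5]})
# RUL(toy)['RUL'].tolist()  # -> [4, 3, 2, 1, 0]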
def load_homepage() -> None:
st.title('Estimate Remaining Useful Life of Turbofan engines')
st.image(image, caption='Photographer: Luke Sharrett/Bloomberg', use_column_width=True)
st.write("""
One of the main challenges in the aviation industry is to reduce maintenance costs and machine downtime while maintaining or improving safety standards. A large percentage of these costly delays are the result of unplanned maintenance, such as when an aircraft shows abnormal behaviour on the previous flight, creating an operational disruption and even requiring an aircraft change.
Since you don't know when failure will occur, you have to be conservative in your planning, especially if you're operating safety-critical equipment like engines. But by scheduling maintenance very early, you waste machine life or spare-part life that is still usable, and this adds to the owner's costs.
However, if you can predict when machine failure will occur, you can schedule maintenance right before it. By implementing predictive maintenance we can minimize unnecessary scheduled maintenance and the production hours lost during maintenance (improving the overall availability of equipment), and reduce the cost of spare parts and consumables used during the maintenance process.
In order to develop an algorithm that predicts the breakdown of a piece of equipment within a given time window (typically some number of days), we require enough historical data to capture information about the events leading to failure.""")
st.subheader('About Data set:')
st.write("""
In this post we use the C-MAPSS dataset, an engine degradation simulation carried out at NASA with the C-MAPSS simulation software. Four different data sets were simulated under different combinations of operational conditions and fault modes.
Each data set consists of three operational settings and 21 sensor measurements (temperature, pressure, fan speed, etc.) for several engines and for every cycle of their lifetime.
*Each engine operates normally at the start of its time series and develops a fault at some point during the series. **In the training set, the fault grows in magnitude until system failure. In the test set, the time series ends some time prior to system failure**.*""")
st.subheader("Problem statement:")
st.write("The goal of predictive maintenance is to predict at the time t, using the data up to that time, whether the equipment will fail in the near future.")
st.write("This problem is can be formulated in two ways:")
st.write("**Classification:** we are aims to predict the probability that the equipment will fail within a pre-specified time window.")
st.write("**Regression:** A regression-based approach to which aims to estimate the remaining time to the end of the equipment's useful life of an engine.")
st.write("In this blog, we'll focus on the second dataset (FD002) in which all engines develop some fault with six operating conditions.")
def EDA():
st.title('Data Exploration')
if st.checkbox("Show Train dataset"):
load_stats =st.text("Loading Train data..............")
df_train = load_data(Data_url_train)
load_stats.text("Loading Train data....done!")
st.subheader('Train data')
st.text("All values are conversion in SI system")
st.write(df_train.head(10))
if st.checkbox("Show Test dataset"):
load_stats.text("Loading Test data..............")
df_Test = load_data(Data_url_test)
load_stats.success(" Done ")
st.subheader('Test data')
st.text("All values are conversion in SI system")
st.write(df_Test.head(10))
st.write("""Before we start EDA, we need to compute a target variable for this data set which is RUL.""")
st.write("""(RUL)Remaining Useful Life(the length of time a machine is likely to operate before it requires repair or replacement.""")
st.write("""As mention about the definition,
RUL = a total number of the life cycle engine - Current engine life cycle """)
st.write("""To calculate RUL in our training data set. we just did group by engine number(max number of cycles) and present engine life cycle.""")
st.write("""For test data, the time series ends sometime prior to system failure, after that we don't have data and we have ground-truth data (The true remaining cycles for each engine in the testing data).
Example: let consider the engine one life cycle data. In a training data life cycle of engine start from 1 and end 223. In test data, the engine runs some cycle like 120, we need to find out how many cycles still remain.
To calculate RUL in our test data set, we did the same thing as training data by adding ground-truth data value to test data.""")
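# Worked example of the test-set arithmetic above (illustrative numbers): if an
# engine ran 120 cycles in the test data and the ground-truth file says 80
# cycles remain, then max_No_cycles = 120 + 80 = 200 and the RUL at cycle t is
# 200 - t (e.g. 80 at the last observed cycle, t = 120).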
df_train = load_data(Data_url_train)
train = RUL(df_train)
if st.checkbox("Show code"):
st.write('Code to calculate RUL:')
code = '''rul = pd.DataFrame(df_train.groupby('Engine_No')['Time_in_cycles'].max()).reset_index() # Data Labeling - generate column RUL
rul.columns = ['Engine_No', 'max_No_cycles']
train = df_train.merge(rul, on=['Engine_No'], how='left')
train['RUL'] = train['max_No_cycles'] - train['Time_in_cycles']
####################################
truth_df = pd.read_csv('https://raw.githubusercontent.com/Ali-Alhamaly/Turbofan_usefull_life_prediction/master/Data/Simulation_Data/RUL_FD002.txt',delim_whitespace=True, header=None)
truth_df.columns = ['RUL']
# generate column max for test data
rul = pd.DataFrame(df_Test.groupby('Engine_No')['Time_in_cycles'].max()).reset_index()
rul.columns = ['Engine_No', 'max_No_cycles']
truth_df['Engine_No'] = truth_df.index + 1
truth_df['max_No_cycles'] = rul['max_No_cycles'] + truth_df['RUL']
truth_df.drop('RUL', axis=1, inplace=True)
# generate RUL for test data
Test = df_Test.merge(truth_df, on=['Engine_No'], how='left')
Test['RUL'] = Test['max_No_cycles'] - Test['Time_in_cycles']'''
st.code(code, language='python')
st.write("Data with RUL calculation")
st.write(train.head(5))
st.write('''Since the degradation in a system generally remains negligible until after some period of operation, the early, higher RUL values are probably unreasonable. We can't really say anything about the RUL before that point because we have no information about the initial wear and tear.''')
if st.checkbox("Show Sketch of the RUL of Engines"):
st.write('''Instead of having our RUL decline linearly, we define our RUL to start out as a constant and only decline linearly after some time''')
#rul_clip_limit
rul_clip_limit = 125
y_train= train[train['Engine_No']==10].RUL.reset_index(drop=True)
y_train_clip= y_train.clip(upper =rul_clip_limit)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8,3))
axes[0].plot(y_train)
axes[0].set_title('Engine RUL before the Clipping')
axes[1].plot(y_train_clip)
axes[1].set_title('Engine RUL after the Clipping')
st.pyplot(fig)
st.header('**Exploratory Data Analysis:**')
st.bar_chart(train.Engine_No.value_counts(),use_container_width=True)
st.write("- Engines have different life durations.The average working time in train data is 206 cycles with a minimum of 128 and a maximum of 378.")
st.write("- 96% of engines are within 300 life cycle and 4% only engines lived above 300 life cycles, this engines we consider as an extreme case.")
st.subheader('**Operational Setting:**')
option_1 = st.selectbox('Which column do you want to display?',['Altitude', 'Mach_number', 'TRA'])
fig = px.scatter(train.reset_index(),x='index',y = str(option_1))
st.plotly_chart(fig)
st.write("""As plots illustrate, all engines are operated six different altitude conditions with a range of (0–12000M.) and with 5 different operating speed(Mach number) of (0–0.84), with two throttle position (TRA) (60–100).""")
st.subheader('**Sensor data:**')
option = st.multiselect('Which columns do you want to display?',train.columns.difference(['Engine_No', 'Time_in_cycles', 'Altitude', 'Mach_number', 'TRA']))
@st.cache
def sensor_data(Engine_No=1):
df = train[train['Engine_No'] == Engine_No]
min_max = MinMaxScaler()
cols = df.columns.difference(['Engine_No', 'Time_in_cycles', 'Altitude', 'Mach_number', 'TRA'])
# scale each sensor to [0, 1] so different sensors can share one line chart
norm_train_df = pd.DataFrame(min_max.fit_transform(df[cols]), columns=cols, index=df.index)
join_df = df[df.columns.difference(cols)].join(norm_train_df)
norm_df = join_df.reindex(columns=df.columns)
return norm_df
df = sensor_data()
st.line_chart(df[option])
st.write("- sensors data remain same throughout engine's life.")
st.write("- we understand that sensors data T2,T24,T30,T50,P2,P15,P30 highly correlated with one another (Based on Temperature and pressure graph look like sames).")
st.write("- sensors Nf and Nc also correlated and also W31 and W32")
st.write('- As we can see, all the sensors data is higly correlated. even data is highly correlated we are keeping this features.')
st.write("Let's plot PDF individual sensors for any given engines.")
option_2 = st.multiselect('Which engines do you want to compare?',train.Engine_No.unique())
# Radio Buttons
status = st.radio("Plot the PDF of the sensor data below",("Pressure sensors","Temperature sensors","Speed sensors","Other parameters"))
if status =="Pressure sensors":
option_3= st.multiselect('Select from Pressure sensors',['P2','P15','P30','Ps30'],['P2','P15'])
elif status == "Temperature sensors":
option_3= st.multiselect('Select from temperature sensors',['T2','T24','T30','T50'],['T2','T50'])
elif status == "Speed sensors":
option_3= st.multiselect('Select from Speed sensors',['Nf','Nc','NRf','NRc','Nf_dmd','PCNfR_dmd','W31','W32'],['W31','W32'])
else:
option_3= st.multiselect('Select from Other parameters',['epr','phi','BPR','fuel_air_ratio','htBleed'],['BPR','fuel_air_ratio'])
df= train.loc[train['Engine_No'].isin(option_2)].reset_index(drop=True)
fig = sns.pairplot(df, vars=option_3, hue="Engine_No")
st.pyplot(fig)
for i in option_2:
st.write('Engine number is',i, ', total number of engine cycle(RUL) is', df[df['Engine_No']==i].shape[0])
st.write("when we are comparing this engines sensors data,we can't tell differentiate between this engines by looking into PDF plots for this data set")
def FE():
st.title('Feature engineering')
st.write('Based on my domain knowledge I came up with 40 new features which can improve model performance. These features are performance parameters of turbofan engines such as thrust, propulsion efficiency, thermal efficiency, Turbine Entry Temperature (TET), etc.')
image = Image.open('C-MAPSS.jpg')
st.image(image, caption='From C-MAPSS paper', use_column_width=True)
st.write("Schematic diagram showing the components,station numbers and sensor data taken from of the C-MAPSS engine.A brief description some of the major modules is summarized below.")
st.write("**Exhaust Gas Temperature (EGT)(T90):** expressed in Kelvin, is the temperature at the engine exhaust and a measure of an engine’s efficiency in producing its design level thrust; the higher the EGT the more wear and deterioration affect an engine. High EGT can be an indication of degraded engine performance. An exceedance in EGT limits can lead to immediate damages of engine parts and/or a life reduction of engine parts. With this in mind it then becomes absolutely important to keep the EGT as low as possible for as long as possible.")
st.write ("**Turbine Inlet Temperature(T41):** The general trend is that raising the turbine inlet temperature increases the specific thrust of the engines with a small increase in fuel consumption rate.However, high temperatures can damage the turbine, as the blades are under large centrifugal stresses and materials are weaker at high temperature.")
st.write("**Thermal efficiency** is a prime factor in turbine performance.The three most important factors affecting the thermal efficiency are turbine inlet temperature(T41), compression ratio(P30/P21), and the component efficiencies of the compressor and turbine. Other factors that affect thermal efficiency are compressor inlet temperature(P21) and combustion efficiency.Maximum thermal efficiency can be obtained by maintaining the highest possible exhaust temperature.since engine life is greatly reduced at high turbine inlet temperatures, the operator should not exceed the exhaust temperatures specified for continuous operation.")
link_1 = "All this feature are created using given sensor data with some assumptions. I am not going to discuss all the feature hear. Please ref to [[1]](http://www.aircraftmonitor.com/uploads/1/5/9/9/15993320/engine_mx_concepts_for_financiers___v2.pdf),[[2]](https://www.flight-mechanic.com/gas-turbine-engine-performance/)"
st.markdown(link_1, unsafe_allow_html=True)
link_2= 'More about my Feature engineering [GitHub](https://github.com/gethgle/Predictive-maintenance-of-turbofan-engines)'
st.markdown(link_2, unsafe_allow_html=True)
st.write('**Improvement:**')
st.write("- From EDA we understand that data set having different operating conditions,so we can create clustering and do labeling on top that create onehotencoder but this only work on FD002 and FD004 datasets")
st.write("- If possible we can create feature engineering with autoencoder.")
st.write('- If we can remove noise from data, we can achieve good results.Let’s try noise reduction effectively by using the autoencoder or other noise reduction techniques')
def MS():
st.title("Model Building")
st.write("As mention early in home page we are going to solve both regression and classification.")
st.write("After completing feature engineering, we got total 63 feature and applied random forest algorithm to select feature and we got 23 feature.")
st.write("We have 4 different data set with different operating condation.Please Select data set below.")
data_set_type= st.selectbox("Please choose a Data set Type", ("FD001", "FD002","FD003", "FD004"))
if data_set_type == "FD001":
url_train = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD001/Feature_Engg_Train_F0001.csv'
url_test = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD001/Feature_Engg_Test_F0001.csv'
elif data_set_type == "FD002":
url_train = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD002/Feature_Engg_Train_F0002.csv'
url_test = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD002/Feature_Engg_Test_F0002.csv'
elif data_set_type == "FD003":
url_train = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD003/Feature_Engg_Train_F0003.csv'
url_test = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD003/Feature_Engg_Test_F0003.csv'
else:
url_train = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD004/Feature_Engg_Train_F0004.csv'
url_test = 'https://raw.githubusercontent.com/gethgle/predictive-maintenance/main/data/output/FD004/Feature_Engg_Test_F0004.csv'
st.subheader("Scaling:")
st.write("Sklearns’ MinMaxScaler can be used to create a scaler fitted to our train data. The default settings create a scaler to scale our training features between 0–1. The scaler is then applied to both our x_train and x_test set.")
imp_col = ['RUL','Engine_No','T50','Nf','Ps30','phi','NRf','NRc','BPR','W31','W32','T48','T41','T90','Ve','EGT_margin','Nc/Nf','PCNcRdmd ','M_cold','W_f','Thrust','Fan_thrust','core_thrust','TSFC','Thermal_efficiency']
@st.cache(allow_output_mutation=True)
def load_feature(URL):
df= pd.read_csv(URL,usecols =imp_col)
return df
Train_feature = load_feature(url_train)
Test_feature= load_feature(url_test)
period=30
Train_feature['label'] = Train_feature['RUL'].apply(lambda x: 1 if x <= period else 0)
Test_feature['label'] = Test_feature['RUL'].apply(lambda x: 1 if x <= period else 0)
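# e.g. with period = 30: RUL = 25 -> label 1 (expected to fail within 30 cycles),
# RUL = 90 -> label 0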
min_max= MinMaxScaler()
cols = Train_feature.columns.difference(['RUL','label','Engine_No'])
norm_train_df = pd.DataFrame(min_max.fit_transform(Train_feature[cols]), columns=cols, index=Train_feature.index)
join_df = Train_feature[Train_feature.columns.difference(cols)].join(norm_train_df)
Train= join_df.reindex(columns = Train_feature.columns)
norm_train_df = pd.DataFrame(min_max.transform(Test_feature[cols]), columns=cols, index=Test_feature.index)
join_df = Test_feature[Test_feature.columns.difference(cols)].join(norm_train_df)
Test= join_df.reindex(columns = Test_feature.columns)
st.write("Shape of train data :",Train.shape)
st.write("Shape of test data :",Test.shape)
st.subheader("Data processing:")
st.write("By default we are having Train and Test datasets.we keep Train data for train and split test data into cross_validation and test data, we pick the last sequence data points as test data,remaining data points as cross_validation.")
st.write("we are taken RUL and Label columns as dependent variable, remaining 23 columns as independent variable and remaining one columns is Engine_No column.")
st.write("For classification approach, generating the label by ourself in this way, from 0 to 30 cycles, we’ve labeled as 1(fail) and rest (>30) as 0(Not fail). For regression problem we keep RUL columns without any changes.")
st.write(Train.head())
@st.cache
def splite_data(sequence_length,data,cross_val=True):
val_data=[]
# iterate over each engine number
for id in data['Engine_No'].unique():
if len(data[data['Engine_No']==id])>= sequence_length:
if cross_val:
# remaining data points (all but the last sequence) become cross-validation data
val_data.append(data[data['Engine_No']==id][:-sequence_length])
else:
# the last sequence_length data points of each engine become the test data
val_data.append(data[data['Engine_No']==id][-sequence_length:])
return pd.concat(val_data)
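# Usage sketch (mirrors the calls below): splite_data(1, Test, cross_val=True)
# keeps all but the last row of each engine (cross-validation), while
# splite_data(1, Test, cross_val=False) keeps only each engine's last row (test);
# with sequence_length=50 the same split uses the last 50 rows instead.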
st.write("After split data,we process data for machine learning and deep learing algorithm little differently which is followed below.")
#This function plots the confusion matrices given y_i, y_i_hat.
st.subheader("**Choose the algorithm**")
ML_type= st.selectbox("Please choose a Model Type", ("Machine learning", "Deep learning"))
@st.cache
def RUL_line(Y_true, Y_pred):
dataset = pd.DataFrame({'True_RUL': Y_true, 'Predicted_RUL': Y_pred}, columns=['True_RUL', 'Predicted_RUL'])
return dataset
def plot_confusion_matrix(test_x, test_y, clf):
pred_y = clf.predict(test_x)
st.write("-"*20, "Confusion matrix", "-"*20)
cm = confusion_matrix(test_y,pred_y)
fig, ax = plt.subplots(figsize=(3,3))
ax = sns.heatmap(cm,annot=True, cmap='Blues')
fig.suptitle('Confusion matrix', fontsize=5)
plt.xlabel('Predicted Class', fontsize=5)
plt.ylabel('Original Class', fontsize=5)
st.pyplot(fig)
# calculating the percentage of data points that are misclassified
st.write("Percentage of mis-classified points :", (np.count_nonzero((pred_y- test_y))/test_y.shape[0])*100)
precision, recall, fscore, support = score(test_y,pred_y)
st.write('precision:', precision[0])
st.write('recall:',recall[0])
st.write('fscore:',fscore[0])
st.write('Negative class labels:',support[0],'positive class labels:',support[1])
if ML_type == "Machine learning":
Algo_type = st.selectbox("Please choose a Problem Type", ("Classification", "Regression"))
if Algo_type == 'Classification':
st.write("For machine learning algorithm we send data to algorithm as a individual data point.")
# cross_val is other than last sequence data points (test data 41214 = cross_val(40966)+test(248))
cross = splite_data(1,Test,cross_val=True)
# we have 248 engines; we pick the last sequence of data points as test data
Test_data = splite_data(1,Test,cross_val= False)
y_train = Train['label']
x_train = Train.drop(['RUL','label','Engine_No'],axis = 1)
y_cross_val = cross['label']
x_cross_val = cross.drop(['RUL','label','Engine_No'],axis = 1)
y_test = Test_data['label'].reset_index(drop=True)
x_test = Test_data.drop(['RUL','label','Engine_No'],axis = 1).reset_index(drop=True)
st.write("Shape of Train data after split:",x_train.shape)
st.write("Shape of Train labeling data :",y_train.shape)
st.write("Shape of Cross validation data after split:",x_cross_val .shape)
st.write("Shape of Cross validation labeling data :",y_cross_val.shape)
st.write("Shape of Test data after split:",x_test.shape)
st.write("Shape of Test labeling data :",y_test.shape)
if st.checkbox("Show code for ML data processing"):
st.write('Code for data processing:')
data_code = '''
def splite_data(sequence_length,data,cross_val=True):
val_data=[]
# we iterating through each engine number
for id in data['Engine_No'].unique():
if len(data[data['Engine_No']==id])>= sequence_length:
if cross_val:
# Remaining data points as cross_validation(other than last sequence data points).
val_data.append(data[data['Engine_No']==id][:-sequence_length])
else:
# We pick the last sequence data points as test data
val_data.append(data[data['Engine_No']==id][-sequence_length:])
return pd.concat(val_data)'''
st.code(data_code, language='python')
select_model = st.selectbox("Please choose a Model from list", ("Logistic regression", "Linear SVM","RF Classifier"))
if select_model == "Logistic regression":
st.write("Let’s train",select_model,"on", data_set_type, "data set")
st.sidebar.markdown("----")
C= st.sidebar.select_slider('Select the C parameter that controls the penalty strength',options=[10 ** x for x in range(-7, 1)])
st.sidebar.write("Selected C value is", C)
Regue = st.sidebar.selectbox("Select regularization",['none', 'l1', 'l2', 'elasticnet'])
st.sidebar.write("Selected regularizer is",Regue)
#sol = st.sidebar.selectbox("Select solvers to convergence ",['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
@st.cache
def train_model(C=1e-06,Regue = "l2"):
clf = SGDClassifier(alpha=C, penalty= Regue, loss='log', random_state=42)
clf.fit(x_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(x_train, y_train)
return sig_clf
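# note: in SGDClassifier, alpha is the regularization strength, so larger values
# of the slider above mean stronger regularization (the inverse of the usual C
# convention in sklearn's LogisticRegression/SVC)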
st.sidebar.write("Train",select_model)
if st.sidebar.button("Train model"):
with st.spinner("Training ongoing......"):
sig_clf= train_model(C=C,Regue = Regue)
else:
# reuse the cached default model; refitting here would mutate the cached object
sig_clf= train_model(C=1e-06,Regue = "l2")
predict_y = sig_clf.predict_proba(x_train)
st.write("The train log loss is:",log_loss(y_train, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_cross_val)
st.write("The cross validation log loss is:",log_loss(y_cross_val, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_test)
st.write("The test log loss is:",log_loss(y_test, predict_y, labels=[0,1], eps=1e-15))
plot_confusion_matrix(x_test,y_test,sig_clf)
elif select_model == "Linear SVM":
st.write("Let’s train",select_model,"on", data_set_type, "data set")
st.sidebar.markdown("----")
C= st.sidebar.select_slider('Select the C parameter that controls the penalty strength',options=[10 ** x for x in range(-7, 3)])
st.sidebar.write("Selected C value is", C)
Regue = st.sidebar.selectbox("Select regularization",['none', 'l1', 'l2', 'elasticnet'])
st.sidebar.write("Selected regularizer is",Regue)
#sol = st.sidebar.selectbox("Select solvers to convergence ",['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
@st.cache(allow_output_mutation=True)
def train_model(C=1e-06,Regue = "l2"):
clf = SGDClassifier(alpha=C, penalty= Regue, loss='hinge', random_state=42)
clf.fit(x_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(x_train, y_train)
return sig_clf
st.sidebar.write("Train",select_model)
if st.sidebar.button("Train model"):
with st.spinner("Training ongoing......"):
sig_clf= train_model(C=C,Regue = Regue)
else:
# reuse the cached default model; refitting here would mutate the cached object
sig_clf= train_model(C=1e-06,Regue = "l2")
predict_y = sig_clf.predict_proba(x_train)
st.write("The train log loss is:",log_loss(y_train, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_cross_val)
st.write("The cross validation log loss is:",log_loss(y_cross_val, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_test)
st.write("The test log loss is:",log_loss(y_test, predict_y, labels=[0,1], eps=1e-15))
plot_confusion_matrix(x_test,y_test,sig_clf)
else:
st.write("Let’s train",select_model,"on", data_set_type, "data set")
st.sidebar.markdown("----")
n_estimators = st.sidebar.select_slider('Select number of estimators',options=[5,10,15,20,25,30,35,40,45,50,75,100,200,500,1000])
st.sidebar.write("Selected number of estimators is ",n_estimators)
max_depth = st.sidebar.select_slider('Select max_depth of the trees',options=[5,10,15,20,25,30,35])
st.sidebar.write("Selected tree depth is ",max_depth)
@st.cache(allow_output_mutation=True)
def rf_model(n_estimators=500,max_depth=10):
clf = RandomForestClassifier(n_estimators=n_estimators, criterion='gini', max_depth=max_depth, random_state=42, n_jobs=-1)
clf.fit(x_train,y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(x_train,y_train)
return sig_clf
st.sidebar.write("Train",select_model)
if st.sidebar.button("Train model"):
with st.spinner("Training ongoing......"):
sig_clf= rf_model(n_estimators=n_estimators,max_depth=max_depth)
else:
# reuse the cached default model; refitting here would mutate the cached object
sig_clf= rf_model(n_estimators=500,max_depth=10)
predict_y = sig_clf.predict_proba(x_train)
st.write("The train log loss is:",log_loss(y_train, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_cross_val)
st.write("The cross validation log loss is:",log_loss(y_cross_val, predict_y, labels=[0,1], eps=1e-15))
predict_y = sig_clf.predict_proba(x_test)
st.write("The test log loss is:",log_loss(y_test, predict_y, labels=[0,1], eps=1e-15))
plot_confusion_matrix(x_test,y_test,sig_clf)
else:
st.write("For machine learning algorithm we send data to algorithm as a individual data point.")
# cross_val is other than last sequence data points (test data 41214 = cross_val(40966)+test(248))
cross = splite_data(1,Test,cross_val=True)
# we have 248 engines; we pick the last sequence of data points as test data
Test_data = splite_data(1,Test,cross_val= False)
y_train = Train['RUL']
x_train = Train.drop(['RUL','label','Engine_No'],axis = 1)
y_cross_val = cross['RUL']
x_cross_val = cross.drop(['RUL','label','Engine_No'],axis = 1)
## Clipped RUL
y_train_clipped = y_train.clip(upper=125)
y_cross_val_clipped = y_cross_val.clip(upper=125)
y_test = Test_data['RUL'].reset_index(drop=True)
x_test = Test_data.drop(['RUL','label','Engine_No'],axis = 1).reset_index(drop=True)
st.write("Shape of Train data after split:",x_train.shape)
st.write("Shape of Train labeling data :",y_train_clipped.shape)
st.write("Shape of Cross validation data after split:",x_cross_val .shape)
st.write("Shape of Cross validation labeling data :",y_cross_val_clipped.shape)
st.write("Shape of Test data after split:",x_test.shape)
st.write("Shape of Test labeling data :",y_test.shape)
if st.checkbox("Show code for ML data processing"):
st.write('Code for data processing:')
data_code = '''
def splite_data(sequence_length,data,cross_val=True):
val_data=[]
# we iterating through each engine number
for id in data['Engine_No'].unique():
if len(data[data['Engine_No']==id])>= sequence_length:
if cross_val:
# Remaining data points as cross_validation(other than last sequence data points).
val_data.append(data[data['Engine_No']==id][:-sequence_length])
else:
# We pick the last sequence data points as test data
val_data.append(data[data['Engine_No']==id][-sequence_length:])
return pd.concat(val_data)'''
st.code(data_code, language='python')
select_model = st.selectbox("Please choose a Model from list", ("Linear regression","RF regression"))
# performance matrix
def evaluate(y_true, y_hat,data_type ="Train"):
mse = mean_squared_error(y_true, y_hat)
rmse = np.sqrt(mse)
variance = r2_score(y_true, y_hat)
return st.write("{} data RMSE: {} and R2: {} ".format(data_type, rmse,variance))
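# reminder: RMSE = sqrt(mean((y_true - y_hat)^2)), measured in RUL cycles, and
# R2 = 1 - SS_res / SS_tot (1.0 is a perfect fit)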
if select_model == "Linear regression":
st.write("Let’s train baseline model,",select_model,"on", data_set_type, "data set")
lm = LinearRegression()
lm.fit(x_train, y_train_clipped)
# predict and evaluate
y_hat_train = lm.predict(x_train)
evaluate(y_train_clipped, y_hat_train,"Train")
y_hat_cross = lm.predict(x_cross_val)
evaluate(y_cross_val_clipped, y_hat_cross,'Cross validation')
y_hat_test = lm.predict(x_test)
evaluate(y_test, y_hat_test,"Test")
st.line_chart (RUL_line(y_test, y_hat_test))
else:
st.write("Let’s train model,",select_model,"on", data_set_type, "data set")
st.sidebar.markdown("----")
n_estimators = st.sidebar.select_slider('Select number of estimators',options=[5,10,15,20,25,30,35,40,45,50,75,100,200,500,1000])
st.sidebar.write("Selected number of estimators is ",n_estimators)
max_depth = st.sidebar.select_slider('Select max_depth of the trees',options=[5,10,15,20,25,30,35])
st.sidebar.write("Selected tree depth is ",max_depth)
@st.cache(allow_output_mutation=True)
def rf_reg_model(n_estimators=30, max_depth=15):
clf = RandomForestRegressor(n_estimators=n_estimators,max_depth=max_depth, max_features='sqrt', min_samples_split=5, random_state=42, n_jobs=-1)
# train on the clipped RUL so the training target matches the evaluation below
clf.fit(x_train,y_train_clipped)
return clf
st.sidebar.write("Train",select_model)
if st.sidebar.button("Train model"):
with st.spinner("Training ongoing......"):
clf= rf_reg_model(n_estimators=n_estimators, max_depth=max_depth)
# predict and evaluate
y_hat_train = clf.predict(x_train)
evaluate(y_train_clipped, y_hat_train,"Train")
y_hat_cross = clf.predict(x_cross_val)
evaluate(y_cross_val_clipped, y_hat_cross,'Cross validation')
y_hat_test = clf.predict(x_test)
evaluate(y_test, y_hat_test,"Test")
st.line_chart (RUL_line(y_test, y_hat_test))
else:
# Run the model with the best hyperparameters
clf_hy = RandomForestRegressor(n_estimators=30,max_depth=15, max_features='sqrt', min_samples_split=5, random_state=42, n_jobs=-1)
clf_hy.fit(x_train,y_train_clipped)
# predict and evaluate
y_hat_train = clf_hy.predict(x_train)
evaluate(y_train_clipped, y_hat_train,"Train")
y_hat_cross = clf_hy.predict(x_cross_val)
evaluate(y_cross_val_clipped, y_hat_cross,'Cross validation')
y_hat_test = clf_hy.predict(x_test)
evaluate(y_test, y_hat_test,"Test")
st.line_chart (RUL_line(y_test, y_hat_test))
else:
def plot_confusion_matrix_dp(input_x,input_label,model):
# the final layer is a single sigmoid unit, so threshold its probability at 0.5
pred_y = (model.predict(input_x) > 0.5).astype(int).ravel()
test_y = input_label.ravel()
st.write("-"*20, "Confusion matrix", "-"*20)
cm = confusion_matrix(test_y,pred_y)
fig, ax = plt.subplots(figsize=(3,3))
ax = sns.heatmap(cm,annot=True, cmap='Blues')
fig.suptitle('Confusion matrix', fontsize=5)
plt.xlabel('Predicted Class', fontsize=5)
plt.ylabel('Original Class', fontsize=5)
st.pyplot(fig)
# calculating the percentage of data points that are misclassified
st.write("Percentage of mis-classified points :", (np.count_nonzero((pred_y- test_y))/test_y.shape[0])*100)
precision, recall, fscore, support = score(test_y,pred_y)
st.write('precision:', precision[0])
st.write('recall:',recall[0])
st.write('fscore:',fscore[0])
st.write('Negative class labels:',support[0],'positive class labels:',support[1])
def plot_train_val(history,loss,val_loss):
st.write("Plot of training & validation loss")
fig, ax = plt.subplots()
ax.plot(history.history[loss])
ax.plot(history.history[val_loss])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
st.pyplot(fig)
def rmse(y_true, y_pred):
return K.sqrt(K.mean(K.square(y_pred - y_true)))
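# Keras metric used at compile time: per-batch root mean squared error,
# rmse = sqrt(mean((y_pred - y_true)^2))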
def plot_loss(history):
fig = plt.figure()
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
st.pyplot(fig)
def load_model(flter_1,drop1,Dense_1,drop2,optimizer):
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=flter_1, kernel_size=2, activation='relu',input_shape=seq_gen_train.shape[1:]))
model_cnn.add(BatchNormalization())
model_cnn.add(Flatten())
model_cnn.add(Dropout(drop1))
model_cnn.add(Dense(Dense_1, activation='relu'))
model_cnn.add(BatchNormalization())
model_cnn.add(Dropout(drop2))
if Algo_type == "Classification":  # the output head depends on the problem type, not the model name
model_cnn.add(Dense(units=1, activation='sigmoid'))
model_cnn.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
else:
model_cnn.add(Dense(units=1, activation='linear'))
model_cnn.compile(loss='mae', optimizer= optimizer,metrics=[rmse])
return model_cnn
def load_LSTM_model(units,drop1,Dense_1,drop2,optimizer):
model_LSTM = Sequential()
model_LSTM.add(LSTM(input_shape=seq_gen_train.shape[1:],units=units,return_sequences=False))
model_LSTM.add(BatchNormalization())
model_LSTM.add(Dropout(drop1))
model_LSTM.add(Dense(Dense_1, activation='relu'))
model_LSTM.add(BatchNormalization())
model_LSTM.add(Dropout(drop2))
if Algo_type == "Classification":  # the output head depends on the problem type, not the model name
model_LSTM.add(Dense(units=1, activation='sigmoid'))
model_LSTM.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
else:
model_LSTM.add(Dense(units=1, activation='linear'))
model_LSTM.compile(loss='mae', optimizer= optimizer,metrics=[rmse])
return model_LSTM
Algo_type = st.selectbox("Please choose a Problem Type", ("Classification", "Regression"))
if Algo_type == "Classification":
st.write("**Deep learning:**")
st.write("For LSTM and Conv1d model expects data to have the shape: [samples, timesteps, features]")
st.write("we split the series with a fixed window and a sliding of 1 step. For example, engine 1 have 222 cycles in train, with a window length equal to 50 we extract 172 time series with length 50:")
st.write('window1 -> from cycle0 to cycle 50,')
st.write("window2 -> from cycle1 to cycle51, … ,")
st.write("window172 -> from cycle172 to cycle222")
st.write("Each window is labeled with the corresponding label of the final cycle taken into account by the window.")
x_train= Train
# cross_val is other than last sequence data points (test data 13096 = cross_val(10096)+test(300))
x_cross = splite_data(50,Test,cross_val=True)
# we have 248 engines; we pick the last sequence of data points as test data
x_test = splite_data(50,Test,cross_val= False)
sequence_length =50
def gen_sequence(id_df, seq_length, seq_cols):
data_array = id_df[seq_cols].values
num_elements = data_array.shape[0]
for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
yield data_array[start:stop, :]
# generate labels
def gen_labels(id_df, seq_length, label):
data_array = id_df[label].clip(upper=125).values
num_elements = data_array.shape[0]
return data_array[seq_length:num_elements, :]
label_gen_train = [gen_labels(x_train[x_train['Engine_No']==id],sequence_length,['label']) for id in x_train['Engine_No'].unique()]
train_label_array = np.concatenate(label_gen_train).astype(np.float32)
label_gen_cross = [gen_labels(x_cross[x_cross['Engine_No']==id], sequence_length,['label'])
for id in x_cross['Engine_No'].unique() if len(x_cross[x_cross['Engine_No']==id])>= sequence_length]
cross_label_array = np.concatenate(label_gen_cross).astype(np.float32)
imp_col = ['T50','Nf','Ps30','phi','NRf','NRc','BPR','W31','W32','T48','T41','T90','Ve','EGT_margin','Nc/Nf','PCNcRdmd ','M_cold','W_f','Thrust','Fan_thrust','core_thrust','TSFC','Thermal_efficiency']
### GENERATE X TRAIN TEST ###
x_train_data, x_cross_data = [], []
for engine_id in x_train.Engine_No.unique():
for sequence in gen_sequence(x_train[x_train['Engine_No']==engine_id], sequence_length, imp_col):
x_train_data.append(sequence)
for sequence in gen_sequence(x_cross[x_cross['Engine_No']==engine_id], sequence_length,imp_col):
x_cross_data.append(sequence)
#******************
seq_gen_train = np.asarray(x_train_data)
seq_array_cross= np.asarray(x_cross_data)
st.write('Shape of Train data' ,seq_gen_train.shape)
st.write('Shape of Train labeling data',train_label_array.shape)
st.write('Shape of cross validation data',seq_array_cross.shape)
st.write('Shape of cross validation labeling data', cross_label_array.shape)
# We pick the last sequence for each engine id in the test data
seq_array_test_last = [x_test[x_test['Engine_No']==id][imp_col].values[-sequence_length:] for id in x_test['Engine_No'].unique() if len(x_test[x_test['Engine_No']==id])>= sequence_length]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
# Similarly, we pick the labels
y_mask = [len(x_test[x_test['Engine_No']==id]) >= sequence_length for id in x_test['Engine_No'].unique()]
label_array_test_last = x_test.groupby('Engine_No')['label'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
st.write('Shape of Test data', seq_array_test_last.shape)
st.write('Shape of Test labeling data',label_array_test_last.shape)
if st.checkbox("Show code for Deep learning data processing"):
st.write('Code for data processing for the deep learning models:')
data_code_lstm ="""
def gen_sequence(id_df, seq_length, seq_cols):
data_array = id_df[seq_cols].values
num_elements = data_array.shape[0]
for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
yield data_array[start:stop, :]
# generate labels
def gen_labels(id_df, seq_length, label):
data_array = id_df[label].clip(upper=125).values
num_elements = data_array.shape[0]
return data_array[seq_length:num_elements, :]
label_gen_train = [gen_labels(x_train[x_train['Engine_No']==id],sequence_length,['label']) for id in x_train['Engine_No'].unique()]
train_label_array = np.concatenate(label_gen_train).astype(np.float32)
label_gen_cross = [gen_labels(x_cross[x_cross['Engine_No']==id], sequence_length,['label'])
for id in x_cross['Engine_No'].unique() if len(x_cross[x_cross['Engine_No']==id])>= sequence_length]
cross_label_array = np.concatenate(label_gen_cross).astype(np.float32)
imp_col = ['T50','Nf','Ps30','phi','NRf','NRc','BPR','W31','W32','T48','T41','T90','Ve','EGT_margin','Nc/Nf','PCNcRdmd ','M_cold','W_f','Thrust','Fan_thrust','core_thrust','TSFC','Thermal_efficiency']
### GENERATE X TRAIN TEST ###
x_train_data, x_cross_data = [], []
for engine_id in x_train.Engine_No.unique():
for sequence in gen_sequence(x_train[x_train['Engine_No']==engine_id], sequence_length, imp_col):
x_train_data.append(sequence)
for sequence in gen_sequence(x_cross[x_cross['Engine_No']==engine_id], sequence_length,imp_col):
x_cross_data.append(sequence)
#******************
seq_gen_train = np.asarray(x_train_data)
seq_array_cross= np.asarray(x_cross_data)
# We pick the last sequence for each engine id in the test data
seq_array_test_last = [x_test[x_test['Engine_No']==id][imp_col].values[-sequence_length:]
for id in x_test['Engine_No'].unique() if len(x_test[x_test['Engine_No']==id])>= sequence_length]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
# Similarly, we pick the labels
#print("y_mask")
y_mask = [len(x_test[x_test['Engine_No']==id]) >= sequence_length for id in x_test['Engine_No'].unique()]
label_array_test_last = x_test.groupby('Engine_No')['label'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
print(label_array_test_last.shape)"""
st.code(data_code_lstm, language='python')
select_model = st.selectbox("Please choose a Model from list", ("CNN", "LSTM"))
if select_model == "CNN":
st.sidebar.markdown("----")
st.sidebar.subheader('Building your CNN')
flter_1= st.sidebar.select_slider('Number of filters in first layer: ',[8,16,32,64,128,256],value=32)
drop1=st.sidebar.select_slider('Drop rate in first layer', [0.1, 0.2,0.3,0.4,0.5],value=0.1)
Dense_1= st.sidebar.select_slider('Number of Dense in second layer: ',[8,16,32,64,128,256],value=8)
drop2=st.sidebar.select_slider('Drop rate in second layer', [0.1, 0.2,0.3,0.4,0.5],value=0.2)
optimizer = st.sidebar.select_slider('Optimizer', ['SGD', 'RMSprop', 'Adam'],value='Adam')
batch_size = st.sidebar.select_slider('Select batch size', [32, 64, 128, 256],value=32)
epochs=st.sidebar.select_slider('Select number of epochs', [10,20,30,50],value=30)
if st.button("Train a model"):
with st.spinner("Training may take a while, so grab a cup of coffee............."):
model = load_model(flter_1,drop1,Dense_1,drop2,optimizer)
history =model.fit(seq_gen_train,train_label_array,batch_size=batch_size,shuffle=True,epochs=epochs,validation_data=(seq_array_cross, cross_label_array))
plot_loss(history)
# Evaluation of training data
scores = model.evaluate(seq_gen_train, train_label_array, verbose=1)
st.write('logloss for training data', scores[0])
# Evaluation of cross val data
scores = model.evaluate(seq_array_cross, cross_label_array, verbose=1)
st.write('logloss for cross_val data', scores[0])
# Evaluation of test data
scores_test = model.evaluate(seq_array_test_last,label_array_test_last, verbose=2)
st.write('logloss for test data', scores_test[0])
plot_confusion_matrix_dp(seq_array_test_last,label_array_test_last,model)
else:
st.sidebar.markdown("----")
st.sidebar.subheader('Building your LSTM')
flter_1= st.sidebar.select_slider('Number of LSTM units in first layer: ',[8,16,32,64,128,256],value=32)
drop1=st.sidebar.select_slider('Drop rate in first layer', [0.1, 0.2,0.3,0.4,0.5],value=0.1)
Dense_1= st.sidebar.select_slider('Number of Dense in second layer: ',[8,16,32,64,128,256],value=8)
drop2=st.sidebar.select_slider('Drop rate in second layer', [0.1, 0.2,0.3,0.4,0.5],value=0.2)
optimizer = st.sidebar.select_slider('Optimizer', ['SGD', 'RMSprop', 'Adam'],value='Adam')
batch_size = st.sidebar.select_slider('Select batch size', [32, 64, 128, 256],value=32)
epochs=st.sidebar.select_slider('Select number of epochs', [10,20,30,50],value=30)
if st.button("Train a model"):
with st.spinner("Training may take a while, so grab a cup of coffee............."):
model = load_LSTM_model(flter_1,drop1,Dense_1,drop2,optimizer)
history =model.fit(seq_gen_train,train_label_array,batch_size=batch_size,shuffle=True,epochs=epochs,validation_data=(seq_array_cross, cross_label_array))
plot_loss(history)
# Evaluation of training data
scores = model.evaluate(seq_gen_train, train_label_array, verbose=1)
st.write('logloss for training data', scores[0])
# Evaluation of cross val data
scores = model.evaluate(seq_array_cross, cross_label_array, verbose=1)
st.write('logloss for cross_val data', scores[0])
# Evaluation of test data
scores_test = model.evaluate(seq_array_test_last,label_array_test_last, verbose=2)
st.write('logloss for test data', scores_test[0])
plot_confusion_matrix_dp(seq_array_test_last,label_array_test_last,model)
else:
st.write("**Deep learning:**")
st.write("For LSTM and Conv1d model expects data to have the shape: [samples, timesteps, features]")
st.write("we split the series with a fixed window and a sliding of 1 step. For example, engine 1 have 222 cycles in train, with a window length equal to 50 we extract 172 time series with length 50:")
st.write('window1 -> from cycle0 to cycle 50,')
st.write("window2 -> from cycle1 to cycle51, … ,")
st.write("window172 -> from cycle172 to cycle222")
st.write("Each window is labeled with the corresponding label of the final cycle taken into account by the window.")
x_train= Train
# cross_val is other than last sequence data points (test data 13096 = cross_val(10096)+test(300))
x_cross = splite_data(50,Test,cross_val=True)
# we have 248 engines; we pick the last sequence of data points as test data
x_test = splite_data(50,Test,cross_val= False)
sequence_length =50
def gen_sequence(id_df, seq_length, seq_cols):
data_array = id_df[seq_cols].values
num_elements = data_array.shape[0]
for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
yield data_array[start:stop, :]
# generate labels
def gen_labels(id_df, seq_length, label):
data_array = id_df[label].clip(upper=125).values
num_elements = data_array.shape[0]
return data_array[seq_length:num_elements, :]
label_gen_train = [gen_labels(x_train[x_train['Engine_No']==id],sequence_length,['RUL']) for id in x_train['Engine_No'].unique()]
train_label_array = np.concatenate(label_gen_train).astype(np.float32)
label_gen_cross = [gen_labels(x_cross[x_cross['Engine_No']==id], sequence_length,['RUL'])
for id in x_cross['Engine_No'].unique() if len(x_cross[x_cross['Engine_No']==id])>= sequence_length]
cross_label_array = np.concatenate(label_gen_cross).astype(np.float32)
imp_col = ['T50','Nf','Ps30','phi','NRf','NRc','BPR','W31','W32','T48','T41','T90','Ve','EGT_margin','Nc/Nf','PCNcRdmd ','M_cold','W_f','Thrust','Fan_thrust','core_thrust','TSFC','Thermal_efficiency']
### GENERATE X TRAIN TEST ###
x_train_data, x_cross_data = [], []
for engine_id in x_train.Engine_No.unique():
for sequence in gen_sequence(x_train[x_train['Engine_No']==engine_id], sequence_length, imp_col):
x_train_data.append(sequence)
for sequence in gen_sequence(x_cross[x_cross['Engine_No']==engine_id], sequence_length,imp_col):
x_cross_data.append(sequence)
#******************
seq_gen_train = np.asarray(x_train_data)
seq_array_cross= np.asarray(x_cross_data)
st.write('Shape of Train data' ,seq_gen_train.shape)
st.write('Shape of Train labeling data',train_label_array.shape)
st.write('Shape of cross validation data',seq_array_cross.shape)
st.write('Shape of cross validation labeling data', cross_label_array.shape)
# We pick the last sequence for each engine id in the test data
seq_array_test_last = [x_test[x_test['Engine_No']==id][imp_col].values[-sequence_length:] for id in x_test['Engine_No'].unique() if len(x_test[x_test['Engine_No']==id])>= sequence_length]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
# Similarly, we pick the labels
y_mask = [len(x_test[x_test['Engine_No']==id]) >= sequence_length for id in x_test['Engine_No'].unique()]
label_array_test_last = x_test.groupby('Engine_No')['RUL'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
st.write('Shape of Test data', seq_array_test_last.shape)
st.write('Shape of Test labeling data',label_array_test_last.shape)
if st.checkbox("Show code for Deep learning data processing"):
st.write('Code for data processing for the deep learning models:')
data_code_lstm ="""
def gen_sequence(id_df, seq_length, seq_cols):
data_array = id_df[seq_cols].values
num_elements = data_array.shape[0]
for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
yield data_array[start:stop, :]
# generate labels
def gen_labels(id_df, seq_length, label):
data_array = id_df[label].clip(upper=125).values
num_elements = data_array.shape[0]
return data_array[seq_length:num_elements, :]
label_gen_train = [gen_labels(x_train[x_train['Engine_No']==id],sequence_length,['RUL']) for id in x_train['Engine_No'].unique()]
train_label_array = np.concatenate(label_gen_train).astype(np.float32)
label_gen_cross = [gen_labels(x_cross[x_cross['Engine_No']==id], sequence_length,['RUL'])
for id in x_cross['Engine_No'].unique() if len(x_cross[x_cross['Engine_No']==id])>= sequence_length]
cross_label_array = np.concatenate(label_gen_cross).astype(np.float32)
imp_col = ['T50','Nf','Ps30','phi','NRf','NRc','BPR','W31','W32','T48','T41','T90','Ve','EGT_margin','Nc/Nf','PCNcRdmd ','M_cold','W_f','Thrust','Fan_thrust','core_thrust','TSFC','Thermal_efficiency']
### GENERATE X TRAIN TEST ###
x_train_data, x_cross_data = [], []
for engine_id in x_train.Engine_No.unique():
for sequence in gen_sequence(x_train[x_train['Engine_No']==engine_id], sequence_length, imp_col):
x_train_data.append(sequence)
for sequence in gen_sequence(x_cross[x_cross['Engine_No']==engine_id], sequence_length,imp_col):
x_cross_data.append(sequence)
#******************
seq_gen_train = np.asarray(x_train_data)
seq_array_cross= np.asarray(x_cross_data)
# We pick the last sequence for each engine id in the test data
seq_array_test_last = [x_test[x_test['Engine_No']==id][imp_col].values[-sequence_length:]
for id in x_test['Engine_No'].unique() if len(x_test[x_test['Engine_No']==id])>= sequence_length]
seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
# Similarly, we pick the labels
#print("y_mask")
y_mask = [len(x_test[x_test['Engine_No']==id]) >= sequence_length for id in x_test['Engine_No'].unique()]
label_array_test_last = x_test.groupby('Engine_No')['RUL'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
print(label_array_test_last.shape)"""
st.code(data_code_lstm, language='python')
select_model = st.selectbox("Please choose a Model from list", ("CNN", "LSTM"))
if select_model == "CNN":
st.sidebar.markdown("----")
st.sidebar.subheader('Building your CNN')
flter_1= st.sidebar.select_slider('Number of filters in first layer: ',[8,16,32,64,128,256],value=32)
drop1=st.sidebar.select_slider('Drop rate in first layer', [0.1, 0.2,0.3,0.4,0.5],value=0.1)
Dense_1= st.sidebar.select_slider('Number of Dense in second layer: ',[8,16,32,64,128,256],value=8)
drop2=st.sidebar.select_slider('Drop rate in second layer', [0.1, 0.2,0.3,0.4,0.5],value=0.2)
optimizer = st.sidebar.select_slider('Optimizer', ['SGD', 'RMSprop', 'Adam'],value='Adam')
batch_size = st.sidebar.select_slider('Select batch size', [32, 64, 128, 256],value=32)
epochs=st.sidebar.select_slider('Select number of epochs', [10,20,30,50],value=30)
if st.button("Train a model"):
with st.spinner("Training may take a while, so grab a cup of coffee............."):
model = load_model(flter_1,drop1,Dense_1,drop2,optimizer)
history =model.fit(seq_gen_train,train_label_array,batch_size=batch_size,shuffle=True,epochs=epochs,validation_data=(seq_array_cross, cross_label_array))
plot_loss(history)
# Evaluation of training data
scores = model.evaluate(seq_gen_train, train_label_array, verbose=1)
st.write('MAE on Training data', scores[0])
st.write('RMSE on Training data', scores[1])
# Evaluation of cross val data
scores = model.evaluate(seq_array_cross, cross_label_array, verbose=1)
st.write('MAE on Cross_val data', scores[0])
st.write('RMSE on Cross_val data', scores[1])
# Evaluation of test data
scores_test = model.evaluate(seq_array_test_last,label_array_test_last, verbose=2)
st.write('MAE on Test data', scores_test[0])
st.write('RMSE on Test data', scores_test[1])
y_pred_test = model.predict(seq_array_test_last)
y_true_test = label_array_test_last
st.line_chart (RUL_line(y_true_test.flatten(), y_pred_test.flatten()))
else:
st.sidebar.markdown("----")
st.sidebar.subheader('Building your LSTM')
flter_1= st.sidebar.select_slider('Number of LSTM units in first layer: ',[8,16,32,64,128,256],value=32)
drop1=st.sidebar.select_slider('Drop rate in first layer', [0.1, 0.2,0.3,0.4,0.5],value=0.1)
Dense_1= st.sidebar.select_slider('Number of Dense in second layer: ',[8,16,32,64,128,256],value=8)
drop2=st.sidebar.select_slider('Drop rate in second layer', [0.1, 0.2,0.3,0.4,0.5],value=0.2)
optimizer = st.sidebar.select_slider('Optimizer', ['SGD', 'RMSprop', 'Adam'],value='Adam')
batch_size = st.sidebar.select_slider('Select batch size', [32, 64, 128, 256],value=32)
epochs=st.sidebar.select_slider('Select number of epochs', [10,20,30,50],value=30)
if st.button("Train a model"):
with st.spinner("Training may take a while, so grab a cup of coffee............."):
model = load_LSTM_model(flter_1,drop1,Dense_1,drop2,optimizer)
history =model.fit(seq_gen_train,train_label_array,batch_size=batch_size,shuffle=True,epochs=epochs,validation_data=(seq_array_cross, cross_label_array))
plot_loss(history)
# Evaluation of training data
scores = model.evaluate(seq_gen_train, train_label_array, verbose=1)
st.write('MAE on Training data', scores[0])
st.write('RMSE on Training data', scores[1])
# Evaluation of cross val data
scores = model.evaluate(seq_array_cross, cross_label_array, verbose=1)
st.write('MAE on Cross_val data', scores[0])
st.write('RMSE on Cross_val data', scores[1])
# Evaluation of test data
scores_test = model.evaluate(seq_array_test_last,label_array_test_last, verbose=2)
st.write('MAE on Test data', scores_test[0])
st.write('RMSE on Test data', scores_test[1])
y_pred_test = model.predict(seq_array_test_last)
y_true_test = label_array_test_last
st.line_chart (RUL_line(y_true_test.flatten(), y_pred_test.flatten()))
st.subheader("Conclusion")
st.write("The task of predicting RUL is certainly interesting.But there is still quite a bit of room for improvement with better training, models, and feature engineering.")
st.write("Our model give good results,but that's not good enough for aviation standards we need to improve model performance as follows:")
st.write("- Deep learning models are not performing as we expected maybe deep learning model performs well if have large data sets.")
st.write("- As we know that data generated by sensors, it means that the data set is contaminated with sensor noise. we can improve model performances by applying denoising techniques.")
st.markdown("---")
link_2= 'You can find the full code on my github page [here](https://github.com/gethgle/Predictive-maintenance-of-turbofan-engines)'
st.markdown(link_2, unsafe_allow_html=True)
def model_predi():
st.title("Model Prediction")
st.write("Let’s predict how models are performing by giveing unseen data.")
st.sidebar.markdown("---")
data_set_type= st.sidebar.selectbox("Please choose a Data set", ("FD001", "FD002","FD003", "FD004"))
@st.cache
def imp_features(data_path):