-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathModelGeneration.R
79 lines (59 loc) · 2.98 KB
/
ModelGeneration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
## Load the required libraries
library(e1071)
library(caret)
## Set the working directory (the folder on your computer that holds the input
## files). The CSV files are read from, and the trained model is saved to, this
## folder via relative paths, so edit the path to match your machine.
Working_directory <- "C:/Users/User/Desktop/Avir.Demo_2.0"
## Fail early with an actionable message instead of setwd()'s generic
## "cannot change working directory" error.
if (!dir.exists(Working_directory)) {
  stop(
    "Working directory not found: ", Working_directory,
    ". Edit 'Working_directory' in this script to point at your data folder.",
    call. = FALSE
  )
}
setwd(Working_directory)
#################################################################
## Example: preparing the inputs used to calculate the Avir features.
## Load the metabolic feature tables. The same features are stored twice:
## once with intensity expressed as peak area (df_PA) and once as peak
## height (df_PH). Label.csv is loaded into df_Label.
df_PA <- read.csv("PeakArea_Demo.csv")
df_PH <- read.csv("PeakHeight_Demo.csv")
df_Label <- read.csv("Label.csv")

## df_result: first three columns of the peak-area table plus an empty
## "Prediction" column reserved for the SVM prediction results.
df_result <- df_PA[, 1:3]
df_result$Prediction <- NA

## df_predict: one row per metabolic feature, keyed by the first column of the
## peak-area table, with one (initially NA) column per machine-learning
## feature of the SVM model.
df_predict <- setNames(as.data.frame(df_PA[, 1]), "Alignment ID")
df_predict[c(
  "Spearman_Cor",
  "Pearson_Cor",
  "RSD_PAPH",
  "norm_diff_PA_PH_median"
)] <- NA
## Example code that calculates the SVM features. If your input uses the given
## format you can run it unchanged; otherwise adapt the column selection below.
##
## For every metabolic feature (one row of the peak tables) four values are
## written into df_predict:
##   Spearman_Cor            Spearman correlation between peak area and height
##   Pearson_Cor             Pearson correlation between peak area and height
##   RSD_PAPH                sd / mean of the ratio peak area / peak height
##   norm_diff_PA_PH_median  (max - min) / median of that same ratio

## The two tables are paired row by row and column by column, so they must
## have the same shape; otherwise values would be paired incorrectly.
if (nrow(df_PA) != nrow(df_PH) || ncol(df_PA) != ncol(df_PH)) {
  stop(
    "Peak area and peak height tables must have the same number of rows ",
    "and columns.",
    call. = FALSE
  )
}
if (ncol(df_PA) < 4) {
  stop(
    "Expected intensity values from column 4 onwards, but the peak tables ",
    "have only ", ncol(df_PA), " column(s).",
    call. = FALSE
  )
}

## Columns holding the intensity values (everything from column 4 onwards).
## You may need to change this line if you use a different input format.
intensity_cols <- 4:ncol(df_PA)

## seq_len() makes the loop a no-op for an empty table (1:0 would not be).
for (i in seq_len(nrow(df_PA))) {
  ## Peak area and peak height of the i-th metabolic feature.
  peak_area <- as.numeric(df_PA[i, intensity_cols])
  peak_height <- as.numeric(df_PH[i, intensity_cols])

  ## Spearman and Pearson correlation between peak height and peak area.
  df_predict$Spearman_Cor[i] <- cor(
    x = peak_height, y = peak_area, method = "spearman"
  )
  df_predict$Pearson_Cor[i] <- cor(
    x = peak_height, y = peak_area, method = "pearson"
  )

  ## Ratio of peak area to peak height, then its relative standard deviation
  ## and its range-to-median ratio.
  area_height_ratio <- peak_area / peak_height
  df_predict$RSD_PAPH[i] <- sd(area_height_ratio) / mean(area_height_ratio)
  df_predict$norm_diff_PA_PH_median[i] <-
    (max(area_height_ratio) - min(area_height_ratio)) /
    median(area_height_ratio)
}

## A peak height of zero or a missing intensity yields NA/NaN/Inf features.
## Report them here so they do not go unnoticed in later steps.
feature_values <- as.matrix(df_predict[, c(
  "Spearman_Cor", "Pearson_Cor", "RSD_PAPH", "norm_diff_PA_PH_median"
)])
n_incomplete <- sum(rowSums(!is.finite(feature_values)) > 0)
if (n_incomplete > 0) {
  warning(
    n_incomplete, " metabolic feature(s) have non-finite SVM feature values ",
    "(e.g. zero peak height or missing intensities).",
    call. = FALSE
  )
}
## Assemble the training table: the computed feature columns of df_predict
## (everything except the ID column) plus the class label taken from the
## second column of Label.csv.
## NOTE(review): the rows of Label.csv are assumed to be in the same order as
## the peak tables -- only the row count can be verified here.
if (nrow(df_Label) != nrow(df_predict)) {
  ## Without this check cbind() may silently recycle the labels when the row
  ## counts differ by a whole multiple.
  stop(
    "Label.csv has ", nrow(df_Label), " row(s) but ", nrow(df_predict),
    " metabolic feature(s) were computed; they must match.",
    call. = FALSE
  )
}
## The label column is named directly (not by position) and stored as a
## factor, which is the response type svm() documents for classification.
Trainingdata <- cbind(df_predict[, -1], Label = as.factor(df_Label[, 2]))

## Train the SVM model: linear-kernel C-classification of Label on all feature
## columns, with probability estimates enabled.
Classifier <- svm(
  formula = Label ~ .,
  data = Trainingdata,
  type = "C-classification",
  kernel = "linear",
  probability = TRUE
)

## Save the trained model to the working directory.
saveRDS(Classifier, file = "SVM.rds")