From feee4c923cad65368a0df8e98a9b0665c00ac4be Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 03:58:36 -0400 Subject: [PATCH 01/62] Bump version number from "v0.12.0" to "v0.13.0-DEV" --- src/base/version.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/version.jl b/src/base/version.jl index 27b0f0ae2..576dd2f68 100644 --- a/src/base/version.jl +++ b/src/base/version.jl @@ -1,5 +1,5 @@ const VERSION = try - convert(VersionNumber, "v0.12.0") + convert(VersionNumber, "v0.13.0-DEV") catch e warn("WARN While creating PredictMD.VERSION, ignoring error $(e)") VersionNumber(0) From 5cd86912ba7376cef12e1860b30e3245cec8abeb Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 18:23:03 -0400 Subject: [PATCH 02/62] Update installation instructions --- README.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 7a2517629..bfd4f9e2d 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,10 @@ PredictMD is a [Julia](https://julialang.org/) package that provides a uniform i 2. Installation (recommended method) - 3. Installation (alternate method with Conda) + 3. Examples - 4. Examples - - - 5. Contributing + 4. Contributing @@ -126,11 +123,7 @@ If you receive the message "INFO: PredictMD tests passed", then you have success If the tests still do not pass or if you still receive an error message, go to [https://github.com/bcbi/PredictMD.jl/issues/new](https://github.com/bcbi/PredictMD.jl/issues/new) and submit a new issue. Please include a screenshot of the error. -## 3. Installation (alternate method with Conda) - -Alternatively, you can install PredictMD using the Conda package manager. Just follow the instructions here: [https://github.com/dilumaluthge/conda-predictmd-cpu](https://github.com/dilumaluthge/conda-predictmd-cpu) - -## 4. Examples +## 3. Examples The `examples/` folder contains several files that illustrate the usage of PredictMD: @@ -158,6 +151,6 @@ The `examples/` folder contains several files that illustrate the usage of Predi
-## 5. Contributing +## 4. Contributing If you would like to contribute to the PredictMD source code, please see [CONTRIBUTING.md](CONTRIBUTING.md). From b480618d862ed654c83af619c2287d1e15b90dc4 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 19:31:14 -0400 Subject: [PATCH 03/62] Catch errors when training --- src/linearmodel/glm.jl | 27 ++++++++++++++++++++------- src/neuralnetwork/knet.jl | 2 +- src/svm/libsvm.jl | 27 ++++++++++++++++++++------- src/tree/decisiontree.jl | 26 +++++++++++++++++++------- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/src/linearmodel/glm.jl b/src/linearmodel/glm.jl index 6fc2f2e0a..b19739ebb 100644 --- a/src/linearmodel/glm.jl +++ b/src/linearmodel/glm.jl @@ -12,7 +12,7 @@ mutable struct GLMModel <: AbstractEstimator link::T6 where T6 <: GLM.Link # parameters (learned from data): - underlyingglm::T where T + underlyingglm::T7 where T7 <: Union{Void, StatsModels.DataFrameRegressionModel} function GLMModel( formula::StatsModels.Formula, @@ -22,6 +22,7 @@ mutable struct GLMModel <: AbstractEstimator isclassificationmodel::Bool = false, isregressionmodel::Bool = false, ) + underlyingglm = nothing result = new( name, isclassificationmodel, @@ -29,6 +30,7 @@ mutable struct GLMModel <: AbstractEstimator formula, family, link, + underlyingglm, ) return result end @@ -65,12 +67,23 @@ function fit!( ) labelsandfeaturesdf = hcat(labelsdf, featuresdf) info(string("INFO Starting to train GLM.jl model.")) - glm = GLM.glm( - estimator.formula, - labelsandfeaturesdf, - estimator.family, - estimator.link, - ) + glm = try + GLM.glm( + estimator.formula, + labelsandfeaturesdf, + estimator.family, + estimator.link, + ) + catch e + warn( + string( + "WARN while training GLM.jl model, ignored error: ", + e, + ) + ) + nothing + end + # glm = info(string("INFO Finished training GLM.jl model.")) estimator.underlyingglm = glm return estimator diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index 56b5e8c53..434da5d18 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -19,7 +19,7 @@ mutable struct KnetModel <: AbstractEstimator # parameters (learned from data): modelweights::T12 where T12 <: AbstractArray - modelweightoptimizers::T13 where T13 + modelweightoptimizers::T13 where T13 <: Any # TODO: do something better here # learning state history::T where T <: ValueHistories.MultivalueHistory diff --git a/src/svm/libsvm.jl b/src/svm/libsvm.jl index cd2cb7ed6..31a66fb43 100644 --- a/src/svm/libsvm.jl +++ b/src/svm/libsvm.jl @@ -11,7 +11,7 @@ mutable struct LIBSVMModel <: AbstractEstimator hyperparameters::T5 where T5 <: Associative # parameters (learned from data): - underlyingsvm::T6 where T6 + underlyingsvm::T6 where T6 <: Union{Void, LIBSVM.SVM} function LIBSVMModel( ; @@ -49,12 +49,14 @@ mutable struct LIBSVMModel <: AbstractEstimator hyperparameters[:cachesize] = cachesize hyperparameters[:verbose] = verbose hyperparameters = fix_dict_type(hyperparameters) + underlyingsvm = nothing result = new( name, isclassificationmodel, isregressionmodel, singlelabellevels, hyperparameters, + underlyingsvm, ) return result end @@ -97,12 +99,23 @@ function fit!( error("Could not figure out if model is classification or regression") end info(string("INFO Starting to train LIBSVM.jl model.")) - svm = LIBSVM.svmtrain( - featuresarray, - labelsarray; - probability = probability, - estimator.hyperparameters... 
- ) + svm = try + LIBSVM.svmtrain( + featuresarray, + labelsarray; + probability = probability, + estimator.hyperparameters... + ) + catch e + warn( + string( + "While training LIBSVM.jl model, ignored error: ", + e, + ) + ) + nothing + end + # svm = info(string("INFO Finished training LIBSVM.jl model.")) estimator.underlyingsvm = svm @assert(typeof(estimator.underlyingsvm.labels) <: AbstractVector) diff --git a/src/tree/decisiontree.jl b/src/tree/decisiontree.jl index 2731e8e7c..2257e5c16 100644 --- a/src/tree/decisiontree.jl +++ b/src/tree/decisiontree.jl @@ -13,7 +13,7 @@ mutable struct DecisionTreeModel <: hyperparameters::T6 where T6 <: Associative # parameters (learned from data): - underlyingrandomforest::T7 where T7 + underlyingrandomforest::T7 where T7 <: Union{Void, DecisionTree.Ensemble} function DecisionTreeModel( singlelabelname::Symbol; @@ -28,6 +28,7 @@ mutable struct DecisionTreeModel <: hyperparameters[:nsubfeatures] = nsubfeatures hyperparameters[:ntrees] = ntrees hyperparameters = fix_dict_type(hyperparameters) + underlyingrandomforest = nothing result = new( name, isclassificationmodel, @@ -35,6 +36,7 @@ mutable struct DecisionTreeModel <: singlelabelname, levels, hyperparameters, + underlyingrandomforest, ) return result end @@ -74,12 +76,22 @@ function fit!( labelsarray::AbstractArray, ) info(string("INFO Starting to train DecisionTree.jl model.")) - randomforest = DecisionTree.build_forest( - labelsarray, - featuresarray, - estimator.hyperparameters[:nsubfeatures], - estimator.hyperparameters[:ntrees], - ) + randomforest = try + DecisionTree.build_forest( + labelsarray, + featuresarray, + estimator.hyperparameters[:nsubfeatures], + estimator.hyperparameters[:ntrees], + ) + catch e + warn( + string( + "While training DecisionTree.jl model, ignored error: ", + e, + ) + ) + nothing + end info(string("INFO Finished training DecisionTree.jl model.")) estimator.underlyingrandomforest = randomforest return estimator From 7a5b40751bdcd392e10b1d31ba792c01a9c3fbaf Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 21:16:30 -0400 Subject: [PATCH 04/62] If model is not defined, return all zeros --- src/linearmodel/glm.jl | 26 +++++++++------ src/neuralnetwork/knet.jl | 4 +-- src/svm/libsvm.jl | 68 ++++++++++++--------------------------- src/tree/decisiontree.jl | 32 ++++++++++++------ 4 files changed, 61 insertions(+), 69 deletions(-) diff --git a/src/linearmodel/glm.jl b/src/linearmodel/glm.jl index b19739ebb..8dd877118 100644 --- a/src/linearmodel/glm.jl +++ b/src/linearmodel/glm.jl @@ -103,17 +103,19 @@ function predict( ) result = DataFrames.DataFrame() labelname = estimator.formula.lhs - @assert(typeof(labelname) <: Symbol) result[labelname] = predictionsvector return result elseif !estimator.isclassificationmodel && estimator.isregressionmodel - glmpredictoutput = GLM.predict( - estimator.underlyingglm, - featuresdf, - ) + if is_nothing(estimator.underlyingglm) + glmpredictoutput = zeros(size(featuresdf,1)) + else + glmpredictoutput = GLM.predict( + estimator.underlyingglm, + featuresdf, + ) + end result = DataFrames.DataFrame() labelname = estimator.formula.lhs - @assert(typeof(labelname) <: Symbol) result[labelname] = glmpredictoutput return result else @@ -126,10 +128,14 @@ function predict_proba( featuresdf::DataFrames.AbstractDataFrame, ) if estimator.isclassificationmodel && !estimator.isregressionmodel - glmpredictoutput = GLM.predict( - estimator.underlyingglm, - featuresdf, - ) + if is_nothing(estimator.underlyingglm,) + glmpredictoutput = 
zeros(size(featuresdf, 1)) + else + glmpredictoutput = GLM.predict( + estimator.underlyingglm, + featuresdf, + ) + end result = Dict() result[1] = glmpredictoutput result[0] = 1 - glmpredictoutput diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index 434da5d18..91bfe9a51 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -168,8 +168,8 @@ function fit!( validation_lossbeforetrainingstarts = estimator.loss( estimator.predict, estimator.modelweights, - training_features_array, - training_labels_array; + validation_features_array, + validation_labels_array; estimator.losshyperparameters... ) end diff --git a/src/svm/libsvm.jl b/src/svm/libsvm.jl index 31a66fb43..f348dc860 100644 --- a/src/svm/libsvm.jl +++ b/src/svm/libsvm.jl @@ -137,48 +137,18 @@ function predict( ) return predictionsvector elseif !estimator.isclassificationmodel && estimator.isregressionmodel - predictedvalues, decisionvalues = LIBSVM.svmpredict( - estimator.underlyingsvm, - featuresarray, - ) - @assert(typeof(predictedvalues) <: AbstractVector) - @assert(ndims(predictedvalues) == 1) - @assert(typeof(decisionvalues) <: AbstractMatrix) - @assert(ndims(decisionvalues) == 2) - @assert(size(predictedvalues, 1) == size(decisionvalues, 2)) - @assert(size(decisionvalues, 1) == 2) - if !( isapprox(sum(abs, decisionvalues[2, :]), 0.0) ) - msg = string( - "sum(abs, decisionvalues[2, :]) is not approx zero. ", - "sum abs: ", - sum(abs, decisionvalues[2, :]), - ". mean abs: ", - mean(abs, decisionvalues[2, :]), - ".", - ) - error(msg) - end - if !( - all( - isapprox.( - predictedvalues[:], - decisionvalues[1, :] - ) - ) - ) - differences = predictedvalues[:] .- decisionvalues[1, :] - msg = string( - "not all predictedvalues[:] are approx equal to ", - "decisionvalues[1, :]. sum abs difference: ", - sum(abs, differences), - ". mean abs difference: ", - mean(abs, differences), - "." 
+ if is_nothing(estimator.underlyingsvm) + predicted_values = zeros(size(featuresarray, 2)) + else + predicted_values, decision_values = LIBSVM.svmpredict( + estimator.underlyingsvm, + featuresarray, ) - error(msg) + if !(typeof(predicted_values) <: AbstractVector) + error("!(typeof(predicted_values) <: AbstractVector)") + end end - result = convert(Vector, vec(predictedvalues)) - return result + return predicted_values else error("Could not figure out if model is classification or regression") end @@ -189,16 +159,20 @@ function predict_proba( featuresarray::AbstractArray, ) if estimator.isclassificationmodel && !estimator.isregressionmodel - estimator.levels = estimator.underlyingsvm.labels - predictedlabels, decisionvalues = LIBSVM.svmpredict( - estimator.underlyingsvm, - featuresarray, - ) - decisionvaluestransposed = transpose(decisionvalues) + if is_nothing(estimator.underlyingsvm) + decision_values = zeros( + size(featuresarray, 2), + length(estimator.underlyingsvm.labels), + ) + decision_values[:, 1] = 1 + else + predicted_labels, decision_values = LIBSVM.svmpredict(estimator.underlyingsvm,featuresarray,) + decision_values = transpose(decision_values) + end result = Dict() for i = 1:length(estimator.underlyingsvm.labels) result[estimator.underlyingsvm.labels[i]] = - decisionvaluestransposed[:, i] + decision_values[:, i] end result = fix_dict_type(result) return result diff --git a/src/tree/decisiontree.jl b/src/tree/decisiontree.jl index 2257e5c16..59e9e88bf 100644 --- a/src/tree/decisiontree.jl +++ b/src/tree/decisiontree.jl @@ -111,11 +111,15 @@ function predict( ) return predictionsvector elseif !estimator.isclassificationmodel && estimator.isregressionmodel - output = DecisionTree.apply_forest( - estimator.underlyingrandomforest, - featuresarray, - ) - return output + if is_nothing(estimator.underlyingrandomforest) + predicted_values = zeros(size(featuresarray,1)) + else + predicted_values = DecisionTree.apply_forest( + estimator.underlyingrandomforest, + featuresarray, + ) + end + return predicted_values else error("Could not figure out if model is classification or regression") end @@ -126,11 +130,19 @@ function predict_proba( featuresarray::AbstractArray, ) if estimator.isclassificationmodel && !estimator.isregressionmodel - predictedprobabilities = DecisionTree.apply_forest_proba( - estimator.underlyingrandomforest, - featuresarray, - estimator.levels, - ) + if is_nothing(estimator.underlyingrandomforest) + predictedprobabilities = zeros( + size(featuresarray, 1), + length(estimator.levels), + ) + predictedprobabilities[:, 1] = 1 + else + predictedprobabilities = DecisionTree.apply_forest_proba( + estimator.underlyingrandomforest, + featuresarray, + estimator.levels, + ) + end result = Dict() for i = 1:length(estimator.levels) result[estimator.levels[i]] = predictedprobabilities[:, i] From 2cf6b8b8f843b0f6794395d67fc2e13ab9353ff5 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 21:19:23 -0400 Subject: [PATCH 05/62] Remove all assert statements --- examples/breast_cancer_biopsy.jl | 2 -- src/metrics/getbinarythresholds.jl | 1 - src/metrics/prcurve.jl | 5 ----- src/metrics/roccurve.jl | 4 ---- src/modelselection/split_data.jl | 2 -- src/neuralnetwork/knet.jl | 1 - src/preprocessing/dataframetodecisiontree.jl | 1 - src/preprocessing/dataframetoknet.jl | 2 -- src/svm/libsvm.jl | 1 - src/utils/probabilitiestopredictions.jl | 2 -- .../functional/breastcancerbiopsy/run_breastcancerbiopsy.jl | 2 -- 11 files changed, 23 deletions(-) diff --git 
a/examples/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy.jl index 6f2fb677a..17b463c72 100644 --- a/examples/breast_cancer_biopsy.jl +++ b/examples/breast_cancer_biopsy.jl @@ -513,8 +513,6 @@ function knetmlp_predict( else normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) normalizedprobs = exp.(normalizedlogprobs) - @assert(all(0 .<= normalizedprobs .<= 1)) - @assert(all(isapprox.(sum(normalizedprobs, 1),1.0;atol = 0.00001,))) return normalizedprobs end end diff --git a/src/metrics/getbinarythresholds.jl b/src/metrics/getbinarythresholds.jl index 058e143d9..a1a8699ed 100644 --- a/src/metrics/getbinarythresholds.jl +++ b/src/metrics/getbinarythresholds.jl @@ -22,6 +22,5 @@ function get_binary_thresholds( ); rev = false, ) - @assert(typeof(result) <: AbstractVector) return result end diff --git a/src/metrics/prcurve.jl b/src/metrics/prcurve.jl index cf9d6ed9c..2fdebf7f3 100644 --- a/src/metrics/prcurve.jl +++ b/src/metrics/prcurve.jl @@ -22,11 +22,6 @@ function prcurve( ) allprecisions = [precision(x) for x in allrocnums] allrecalls = [recall(x) for x in allrocnums] - # - @assert(typeof(allprecisions) <: AbstractVector) - @assert(typeof(allrecalls) <: AbstractVector) - @assert(typeof(allthresholds) <: AbstractVector) - # permutation = sortperm(allthresholds; rev = false) allprecisions = allprecisions[permutation] allrecalls = allrecalls[permutation] diff --git a/src/metrics/roccurve.jl b/src/metrics/roccurve.jl index 0add252af..2d6220b46 100644 --- a/src/metrics/roccurve.jl +++ b/src/metrics/roccurve.jl @@ -23,10 +23,6 @@ function roccurve( allfpr = [false_positive_rate(x) for x in allrocnums] alltpr = [true_positive_rate(x) for x in allrocnums] # - @assert(typeof(allfpr) <: AbstractVector) - @assert(typeof(alltpr) <: AbstractVector) - @assert(typeof(allthresholds) <: AbstractVector) - # permutation = sortperm(allthresholds; rev = false) allfpr = allfpr[permutation] alltpr = alltpr[permutation] diff --git a/src/modelselection/split_data.jl b/src/modelselection/split_data.jl index b8f5006c8..37b08d75f 100644 --- a/src/modelselection/split_data.jl +++ b/src/modelselection/split_data.jl @@ -30,8 +30,6 @@ function split_data( num_rows = size(featuresdf, 1) num_partition_1 = round(Int, split * num_rows) num_partition_2 = num_rows - num_partition_1 - @assert(num_partition_1 + num_partition_2 == num_rows) - @assert( isapprox(num_partition_1/num_rows, split; atol=0.1) ) allrows = convert(Array, 1:num_rows) partition_1_rows = StatsBase.sample( rng, diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index 91bfe9a51..1b6d1b45c 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -334,7 +334,6 @@ function predict_proba( ) outputtransposed = transpose(output) numclasses = size(outputtransposed, 2) - @assert(numclasses > 0) result = Dict() for i = 1:numclasses result[i] = outputtransposed[:, i] diff --git a/src/preprocessing/dataframetodecisiontree.jl b/src/preprocessing/dataframetodecisiontree.jl index b1efbf9e0..ee7213e7a 100644 --- a/src/preprocessing/dataframetodecisiontree.jl +++ b/src/preprocessing/dataframetodecisiontree.jl @@ -54,7 +54,6 @@ function transform( ) singlelabelname = transformer.singlelabelname labelsarray = convert(Array, labelsdf[singlelabelname]) - @assert(typeof(labelsarray) <: AbstractVector) modelformula = generate_formula( transformer.featurenames[1], transformer.featurenames; diff --git a/src/preprocessing/dataframetoknet.jl b/src/preprocessing/dataframetoknet.jl index 6d209fff9..409f28a8c 100644 --- 
a/src/preprocessing/dataframetoknet.jl +++ b/src/preprocessing/dataframetoknet.jl @@ -214,8 +214,6 @@ function transform( ) training_labels_array = [labelstring2intmap_1[y] for y in training_labels_df[label_1]] - @assert(typeof(training_labels_array) <: AbstractVector) - @assert(length(training_labels_array) == size(training_labels_df, 1)) else training_labels_array = Array{Int}( size(training_labels_df, 1), diff --git a/src/svm/libsvm.jl b/src/svm/libsvm.jl index f348dc860..13814c9da 100644 --- a/src/svm/libsvm.jl +++ b/src/svm/libsvm.jl @@ -118,7 +118,6 @@ function fit!( # svm = info(string("INFO Finished training LIBSVM.jl model.")) estimator.underlyingsvm = svm - @assert(typeof(estimator.underlyingsvm.labels) <: AbstractVector) estimator.levels = estimator.underlyingsvm.labels return estimator end diff --git a/src/utils/probabilitiestopredictions.jl b/src/utils/probabilitiestopredictions.jl index f1232e10b..ec00f0d03 100644 --- a/src/utils/probabilitiestopredictions.jl +++ b/src/utils/probabilitiestopredictions.jl @@ -22,13 +22,11 @@ function singlelabelprobabilitiestopredictions( ) classes = sort(unique(collect(keys(probabilitiesassoc)))) numclasses = length(classes) - @assert(typeof(probabilitiesassoc[classes[1]]) <: AbstractVector) numrows = size(probabilitiesassoc[classes[1]], 1) probabilitiesmatrix = Matrix{floattype}(numrows, numclasses) for j = 1:numclasses probabilitiesmatrix[:, j] = floattype.(probabilitiesassoc[classes[j]]) end - @assert( all( isapprox.( sum(probabilitiesmatrix, 2) , 1.0 ) ) ) predictionsvector = Vector{String}(numrows) for i = 1:numrows predictionsvector[i] = diff --git a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl index 115e1f3d2..b44d75554 100644 --- a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl +++ b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl @@ -495,8 +495,6 @@ function knetmlp_predict( if probabilities normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) normalizedprobs = exp.(normalizedlogprobs) - @assert(all(0 .<= normalizedprobs .<= 1)) - @assert(all(isapprox.(sum(normalizedprobs, 1),1.0;atol = 0.00001,))) return normalizedprobs else return unnormalizedlogprobs From 61a5cfcbc1a147866ffcdc6cc7e409f4aa050e1e Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sat, 19 May 2018 23:01:50 -0400 Subject: [PATCH 06/62] Change how we split data --- examples/boston_housing.jl | 128 ++++----- .../boston_housing_linear_regression.ipynb | 36 +-- .../boston_housing_linear_regression.jl | 36 +-- .../boston_housing_metric_comparison.ipynb | 16 +- .../boston_housing_metric_comparison.jl | 16 +- .../boston_housing/boston_housing_mlp.ipynb | 36 +-- examples/boston_housing/boston_housing_mlp.jl | 36 +-- .../boston_housing_random_forest.ipynb | 36 +-- .../boston_housing_random_forest.jl | 36 +-- .../boston_housing/boston_housing_svm.ipynb | 58 ++-- examples/boston_housing/boston_housing_svm.jl | 58 ++-- examples/breast_cancer_biopsy.jl | 230 ++++++++-------- src/classimbalance/smote.jl | 32 +-- src/linearmodel/glm.jl | 22 +- src/metrics/risk_score_cutoff_values.jl | 8 +- .../singlelabelbinaryclassificationmetrics.jl | 24 +- src/metrics/singlelabelregressionmetrics.jl | 24 +- src/modelselection/split_data.jl | 43 ++- src/plotting/plotprcurve.jl | 16 +- src/plotting/plotroccurve.jl | 16 +- ...glelabelbinaryclassclassifierhistograms.jl | 8 +- ...lotsinglelabelregressiontruevspredicted.jl | 8 +- src/plotting/probability_calibration_plots.jl | 
32 +-- src/preprocessing/dataframetodecisiontree.jl | 26 +- src/preprocessing/dataframetoglm.jl | 30 +- src/preprocessing/dataframetoknet.jl | 26 +- .../bostonhousing/run_bostonhousing.jl | 165 ++++++----- .../run_breastcancerbiopsy.jl | 257 +++++++++--------- 28 files changed, 725 insertions(+), 734 deletions(-) diff --git a/examples/boston_housing.jl b/examples/boston_housing.jl index 7f59a1463..fc0babb6f 100644 --- a/examples/boston_housing.jl +++ b/examples/boston_housing.jl @@ -101,15 +101,15 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labelsdf[labelname]) +DataFrames.describe(labels_df[labelname]) # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7) +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7) ############################################################################## ############################################################################## @@ -136,7 +136,7 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(linearreg, feature_contrasts) # Train linear regression model - PredictMD.fit!(linearreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(linearreg,training_features_df,traininglabels_df,) end # View coefficients, p values, etc. for underlying linear regression @@ -145,8 +145,8 @@ PredictMD.get_underlying(linearreg) # Plot true values versus predicted values for linear regression on training set linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) PredictMD.open(linearreg_plot_training) @@ -154,8 +154,8 @@ PredictMD.open(linearreg_plot_training) # Plot true values versus predicted values for linear regression on testing set linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname ) PredictMD.open(linearreg_plot_testing) @@ -163,16 +163,16 @@ PredictMD.open(linearreg_plot_testing) # Evaluate performance of linear regression on training set PredictMD.singlelabelregressionmetrics( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of linear regression on testing set PredictMD.singlelabelregressionmetrics( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -196,14 +196,14 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(randomforestreg, feature_contrasts) # Train random forest model on training set - PredictMD.fit!(randomforestreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for random forest on training set randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) PredictMD.open(randomforestreg_plot_training) @@ -211,8 +211,8 @@ 
PredictMD.open(randomforestreg_plot_training) # Plot true values versus predicted values for random forest on testing set randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open(randomforestreg_plot_testing) @@ -220,16 +220,16 @@ PredictMD.open(randomforestreg_plot_testing) # Evaluate performance of random forest on training set PredictMD.singlelabelregressionmetrics( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of random forest on testing set PredictMD.singlelabelregressionmetrics( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -254,14 +254,14 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(epsilonsvr_svmreg, feature_contrasts) # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for epsilon-SVR on training set epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) PredictMD.open(epsilonsvr_svmreg_plot_training) @@ -269,8 +269,8 @@ PredictMD.open(epsilonsvr_svmreg_plot_training) # Plot true values versus predicted values for epsilon-SVR on testing set epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open(epsilonsvr_svmreg_plot_testing) @@ -278,16 +278,16 @@ PredictMD.open(epsilonsvr_svmreg_plot_testing) # Evaluate performance of epsilon-SVR on training set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of epsilon-SVR on testing set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -312,14 +312,14 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(nusvr_svmreg, feature_contrasts) # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for nu-SVR on training set nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) PredictMD.open(nusvr_svmreg_plot_training) @@ -327,8 +327,8 @@ PredictMD.open(nusvr_svmreg_plot_training) # Plot true values versus predicted values for nu-SVR on testing set nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open(nusvr_svmreg_plot_testing) @@ -336,16 +336,16 @@ PredictMD.open(nusvr_svmreg_plot_testing) # Evaluate performance of nu-SVR on training set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - trainingfeaturesdf, - 
traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of nu-SVR on testing set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -458,7 +458,7 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(knetmlpreg, feature_contrasts) # Train multilayer perceptron model on training set - PredictMD.fit!(knetmlpreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(knetmlpreg,training_features_df,traininglabels_df,) end # Plot learning curve: loss vs. epoch @@ -500,8 +500,8 @@ PredictMD.open(knet_learningcurve_lossvsiteration_skip100iterations) # Plot true values versus predicted values for multilayer perceptron on training set knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) PredictMD.open(knetmlpreg_plot_training) @@ -509,8 +509,8 @@ PredictMD.open(knetmlpreg_plot_training) # Plot true values versus predicted values for multilayer perceptron on testing set knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open(knetmlpreg_plot_testing) @@ -518,16 +518,16 @@ PredictMD.open(knetmlpreg_plot_testing) # Evaluate performance of multilayer perceptron on training set PredictMD.singlelabelregressionmetrics( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of multilayer perceptron on testing set PredictMD.singlelabelregressionmetrics( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -546,8 +546,8 @@ showall(PredictMD.singlelabelregressionmetrics( nusvr_svmreg, knetmlpreg, ], - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, )) @@ -560,8 +560,8 @@ showall(PredictMD.singlelabelregressionmetrics( nusvr_svmreg, knetmlpreg, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, )) @@ -589,15 +589,15 @@ end # output by each of regression models. 
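# [Editor's note, not part of the patch] The hunks above mechanically rename
# the split outputs to snake_case. A minimal usage sketch of the pattern the
# renamed code settles on, assuming split_data keeps the positional
# signature shown in these hunks and returns four DataFrames:
#
#     features_df = df[featurenames]
#     labels_df = df[[labelname]]
#     training_features_df, testing_features_df,
#         traininglabels_df, testing_labels_df =
#         PredictMD.split_data(features_df, labels_df, 0.7)
#
# The third argument is the training fraction: roughly 70% of rows land in
# the training partition and the remaining 30% in the testing partition.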
# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,trainingfeaturesdf,) -PredictMD.predict(randomforestreg,trainingfeaturesdf,) -PredictMD.predict(epsilonsvr_svmreg,trainingfeaturesdf,) -PredictMD.predict(nusvr_svmreg,trainingfeaturesdf,) -PredictMD.predict(knetmlpreg,trainingfeaturesdf,) +PredictMD.predict(linearreg,training_features_df,) +PredictMD.predict(randomforestreg,training_features_df,) +PredictMD.predict(epsilonsvr_svmreg,training_features_df,) +PredictMD.predict(nusvr_svmreg,training_features_df,) +PredictMD.predict(knetmlpreg,training_features_df,) # Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testingfeaturesdf,) -PredictMD.predict(randomforestreg,testingfeaturesdf,) -PredictMD.predict(epsilonsvr_svmreg,testingfeaturesdf,) -PredictMD.predict(nusvr_svmreg,testingfeaturesdf,) -PredictMD.predict(knetmlpreg,testingfeaturesdf,) +PredictMD.predict(linearreg,testing_features_df,) +PredictMD.predict(randomforestreg,testing_features_df,) +PredictMD.predict(epsilonsvr_svmreg,testing_features_df,) +PredictMD.predict(nusvr_svmreg,testing_features_df,) +PredictMD.predict(knetmlpreg,testing_features_df,) diff --git a/examples/boston_housing/boston_housing_linear_regression.ipynb b/examples/boston_housing/boston_housing_linear_regression.ipynb index 27bd9c85e..c7c5b8a71 100644 --- a/examples/boston_housing/boston_housing_linear_regression.ipynb +++ b/examples/boston_housing/boston_housing_linear_regression.ipynb @@ -316,12 +316,12 @@ "labelname = :MedV\n", "\n", "# Put features and labels in separate dataframes\n", - "featuresdf = df[featurenames]\n", - "labelsdf = df[[labelname]]\n", + "features_df = df[featurenames]\n", + "labels_df = df[[labelname]]\n", "\n", "# Display for exploration\n", - "display(DataFrames.head(featuresdf))\n", - "display(DataFrames.head(labelsdf))" + "display(DataFrames.head(features_df))\n", + "display(DataFrames.head(labels_df))" ] }, { @@ -349,7 +349,7 @@ ], "source": [ "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labelsdf[labelname])" + "DataFrames.describe(labels_df[labelname])" ] }, { @@ -359,8 +359,8 @@ "outputs": [], "source": [ "# Split data into training set (70%) and testing set (30%)\n", - "trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =\n", - " PredictMD.split_data(featuresdf,labelsdf;training = 0.7,testing = 0.3,);" + "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", + " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" ] }, { @@ -438,7 +438,7 @@ " # set feature contrasts\n", " PredictMD.set_feature_contrasts!(linearreg, contrasts)\n", " # Train linear regression model\n", - " PredictMD.fit!(linearreg,trainingfeaturesdf,traininglabelsdf,)\n", + " PredictMD.fit!(linearreg,training_features_df,traininglabels_df,)\n", "end" ] }, @@ -1035,8 +1035,8 @@ "# Plot true values versus predicted values for linear regression on training set\n", "linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " linearreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )\n", "# PredictMD.open(linearreg_plot_training)" @@ -1381,8 +1381,8 @@ "# Plot true values versus predicted values for linear regression on testing set\n", "linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " linearreg,\n", - " 
testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname\n", " )\n", "# PredictMD.open(linearreg_plot_testing)" @@ -1414,8 +1414,8 @@ "# Evaluate performance of linear regression on training set\n", "PredictMD.singlelabelregressionmetrics(\n", " linearreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -1446,8 +1446,8 @@ "# Evaluate performance of linear regression on testing set\n", "PredictMD.singlelabelregressionmetrics(\n", " linearreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -1536,10 +1536,10 @@ "# output by each of regression models.\n", "\n", "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(linearreg,trainingfeaturesdf,)\n", + "PredictMD.predict(linearreg,training_features_df,)\n", "\n", "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(linearreg,testingfeaturesdf,)" + "PredictMD.predict(linearreg,testing_features_df,)" ] } ], diff --git a/examples/boston_housing/boston_housing_linear_regression.jl b/examples/boston_housing/boston_housing_linear_regression.jl index 56ab68a0b..827e11d09 100644 --- a/examples/boston_housing/boston_housing_linear_regression.jl +++ b/examples/boston_housing/boston_housing_linear_regression.jl @@ -66,19 +66,19 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # Display for exploration -display(DataFrames.head(featuresdf)) -display(DataFrames.head(labelsdf)) +display(DataFrames.head(features_df)) +display(DataFrames.head(labels_df)) # View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labelsdf[labelname]) +DataFrames.describe(labels_df[labelname]) # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7); +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7); # Set up linear regression model linearreg = PredictMD.singlelabeldataframelinearregression( @@ -95,7 +95,7 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(linearreg , feature_contrasts) # Train linear regression model - PredictMD.fit!(linearreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(linearreg,training_features_df,traininglabels_df,) end # View coefficients, p values, etc. 
for underlying linear regression @@ -104,8 +104,8 @@ PredictMD.get_underlying(linearreg) # Plot true values versus predicted values for linear regression on training set linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # PredictMD.open(linearreg_plot_training) @@ -113,8 +113,8 @@ linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted # Plot true values versus predicted values for linear regression on testing set linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname ) # PredictMD.open(linearreg_plot_testing) @@ -122,16 +122,16 @@ linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( # Evaluate performance of linear regression on training set PredictMD.singlelabelregressionmetrics( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of linear regression on testing set PredictMD.singlelabelregressionmetrics( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -143,7 +143,7 @@ end # output by each of regression models. # Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,trainingfeaturesdf,) +PredictMD.predict(linearreg,training_features_df,) # Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testingfeaturesdf,) +PredictMD.predict(linearreg,testing_features_df,) diff --git a/examples/boston_housing/boston_housing_metric_comparison.ipynb b/examples/boston_housing/boston_housing_metric_comparison.ipynb index 143dd70af..3dd9e7355 100644 --- a/examples/boston_housing/boston_housing_metric_comparison.ipynb +++ b/examples/boston_housing/boston_housing_metric_comparison.ipynb @@ -92,12 +92,12 @@ "labelname = :MedV\n", "\n", "# Put features and labels in separate dataframes\n", - "featuresdf = df[featurenames]\n", - "labelsdf = df[[labelname]]\n", + "features_df = df[featurenames]\n", + "labels_df = df[[labelname]]\n", "\n", "# Split data into training set (70%) and testing set (30%)\n", - "trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =\n", - " PredictMD.split_data(featuresdf,labelsdf;training = 0.7,testing = 0.3,);" + "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", + " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" ] }, { @@ -343,8 +343,8 @@ " nusvr_svmreg,\n", " knetmlpreg,\n", " ],\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " ))\n", "\n", @@ -357,8 +357,8 @@ " nusvr_svmreg,\n", " knetmlpreg,\n", " ],\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " ))" ] diff --git a/examples/boston_housing/boston_housing_metric_comparison.jl b/examples/boston_housing/boston_housing_metric_comparison.jl index 066deff87..eb3cb6fbb 100644 --- a/examples/boston_housing/boston_housing_metric_comparison.jl +++ b/examples/boston_housing/boston_housing_metric_comparison.jl @@ -44,12 +44,12 @@ featurenames = Symbol[ labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = 
df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7); +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7); # load pre-trained models linearreg_filename = "./linearreg.jld2" @@ -194,8 +194,8 @@ showall(PredictMD.singlelabelregressionmetrics( nusvr_svmreg, knetmlpreg, ], - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, )) @@ -208,7 +208,7 @@ showall(PredictMD.singlelabelregressionmetrics( nusvr_svmreg, knetmlpreg, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, )) diff --git a/examples/boston_housing/boston_housing_mlp.ipynb b/examples/boston_housing/boston_housing_mlp.ipynb index cf1c7e54a..2f3bf980d 100644 --- a/examples/boston_housing/boston_housing_mlp.ipynb +++ b/examples/boston_housing/boston_housing_mlp.ipynb @@ -299,12 +299,12 @@ "labelname = :MedV\n", "\n", "# Put features and labels in separate dataframes\n", - "featuresdf = df[featurenames]\n", - "labelsdf = df[[labelname]]\n", + "features_df = df[featurenames]\n", + "labels_df = df[[labelname]]\n", "\n", "# Display for exploration\n", - "display(DataFrames.head(featuresdf))\n", - "display(DataFrames.head(labelsdf))" + "display(DataFrames.head(features_df))\n", + "display(DataFrames.head(labels_df))" ] }, { @@ -332,7 +332,7 @@ ], "source": [ "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labelsdf[labelname])" + "DataFrames.describe(labels_df[labelname])" ] }, { @@ -342,8 +342,8 @@ "outputs": [], "source": [ "# Split data into training set (70%) and testing set (30%)\n", - "trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =\n", - " PredictMD.split_data(featuresdf,labelsdf;training = 0.7,testing = 0.3,);" + "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", + " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" ] }, { @@ -558,7 +558,7 @@ " # set feature contrasts\n", " PredictMD.set_feature_contrasts!(knetmlpreg, contrasts)\n", " # Train multilayer perceptron model on training set\n", - " PredictMD.fit!(knetmlpreg,trainingfeaturesdf,traininglabelsdf,)\n", + " PredictMD.fit!(knetmlpreg,training_features_df,traininglabels_df,)\n", "end" ] }, @@ -2048,8 +2048,8 @@ "# Plot true values versus predicted values for multilayer perceptron on training set\n", "knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " knetmlpreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -2393,8 +2393,8 @@ "# Plot true values versus predicted values for multilayer perceptron on testing set\n", "knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " knetmlpreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -2425,8 +2425,8 @@ "# Evaluate performance of multilayer perceptron on training set\n", "PredictMD.singlelabelregressionmetrics(\n", " knetmlpreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -2457,8 +2457,8 @@ 
"# Evaluate performance of multilayer perceptron on testing set\n", "PredictMD.singlelabelregressionmetrics(\n", " knetmlpreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -2547,7 +2547,7 @@ "# output by each of regression models.\n", "\n", "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(knetmlpreg,trainingfeaturesdf)" + "PredictMD.predict(knetmlpreg,training_features_df)" ] }, { @@ -2597,7 +2597,7 @@ ], "source": [ "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(knetmlpreg,testingfeaturesdf)" + "PredictMD.predict(knetmlpreg,testing_features_df)" ] } ], diff --git a/examples/boston_housing/boston_housing_mlp.jl b/examples/boston_housing/boston_housing_mlp.jl index 5a9ae3274..af22d212f 100644 --- a/examples/boston_housing/boston_housing_mlp.jl +++ b/examples/boston_housing/boston_housing_mlp.jl @@ -63,19 +63,19 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # Display for exploration -display(DataFrames.head(featuresdf)) -display(DataFrames.head(labelsdf)) +display(DataFrames.head(features_df)) +display(DataFrames.head(labels_df)) # View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labelsdf[labelname]) +DataFrames.describe(labels_df[labelname]) # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7); +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7); # Define predict function function knetmlp_predict( @@ -182,7 +182,7 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(knetmlpreg , feature_contrasts) # Train multilayer perceptron model on training set - PredictMD.fit!(knetmlpreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(knetmlpreg,training_features_df,traininglabels_df,) end # Plot learning curve: loss vs. epoch @@ -220,32 +220,32 @@ knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcur # Plot true values versus predicted values for multilayer perceptron on training set knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Plot true values versus predicted values for multilayer perceptron on testing set knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) # Evaluate performance of multilayer perceptron on training set PredictMD.singlelabelregressionmetrics( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of multilayer perceptron on testing set PredictMD.singlelabelregressionmetrics( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -257,7 +257,7 @@ end # output by each of regression models. 
# Get real-valued predictions from each model for training set -PredictMD.predict(knetmlpreg,trainingfeaturesdf) +PredictMD.predict(knetmlpreg,training_features_df) # Get real-valued predictions from each model for testing set -PredictMD.predict(knetmlpreg,testingfeaturesdf) +PredictMD.predict(knetmlpreg,testing_features_df) diff --git a/examples/boston_housing/boston_housing_random_forest.ipynb b/examples/boston_housing/boston_housing_random_forest.ipynb index dfabd8c71..6e4c2f1d0 100644 --- a/examples/boston_housing/boston_housing_random_forest.ipynb +++ b/examples/boston_housing/boston_housing_random_forest.ipynb @@ -287,12 +287,12 @@ "labelname = :MedV\n", "\n", "# Put features and labels in separate dataframes\n", - "featuresdf = df[featurenames]\n", - "labelsdf = df[[labelname]]\n", + "features_df = df[featurenames]\n", + "labels_df = df[[labelname]]\n", "\n", "# Display for exploration\n", - "display(DataFrames.head(featuresdf))\n", - "display(DataFrames.head(labelsdf))" + "display(DataFrames.head(features_df))\n", + "display(DataFrames.head(labels_df))" ] }, { @@ -320,7 +320,7 @@ ], "source": [ "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labelsdf[labelname])" + "DataFrames.describe(labels_df[labelname])" ] }, { @@ -330,8 +330,8 @@ "outputs": [], "source": [ "# Split data into training set (70%) and testing set (30%)\n", - "trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =\n", - " PredictMD.split_data(featuresdf,labelsdf;training = 0.7,testing = 0.3,);" + "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", + " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" ] }, { @@ -393,7 +393,7 @@ " # set feature contrasts\n", " PredictMD.set_feature_contrasts!(randomforestreg, contrasts)\n", " # Train random forest model on training set\n", - " PredictMD.fit!(randomforestreg,trainingfeaturesdf,traininglabelsdf,)\n", + " PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,)\n", "end" ] }, @@ -947,8 +947,8 @@ "# Plot true values versus predicted values for random forest on training set\n", "randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " randomforestreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -1301,8 +1301,8 @@ "# Plot true values versus predicted values for random forest on testing set\n", "randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " randomforestreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -1333,8 +1333,8 @@ "# Evaluate performance of random forest on training set\n", "PredictMD.singlelabelregressionmetrics(\n", " randomforestreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -1365,8 +1365,8 @@ "# Evaluate performance of random forest on testing set\n", "PredictMD.singlelabelregressionmetrics(\n", " randomforestreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -1455,7 +1455,7 @@ "# output by each of regression models.\n", "\n", "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(randomforestreg,trainingfeaturesdf)" + 
"PredictMD.predict(randomforestreg,training_features_df)" ] }, { @@ -1505,7 +1505,7 @@ ], "source": [ "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(randomforestreg,testingfeaturesdf)" + "PredictMD.predict(randomforestreg,testing_features_df)" ] }, { diff --git a/examples/boston_housing/boston_housing_random_forest.jl b/examples/boston_housing/boston_housing_random_forest.jl index 844fddbe6..8f22d3aca 100644 --- a/examples/boston_housing/boston_housing_random_forest.jl +++ b/examples/boston_housing/boston_housing_random_forest.jl @@ -62,19 +62,19 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # Display for exploration -display(DataFrames.head(featuresdf)) -display(DataFrames.head(labelsdf)) +display(DataFrames.head(features_df)) +display(DataFrames.head(labels_df)) # View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labelsdf[labelname]) +DataFrames.describe(labels_df[labelname]) # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7); +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7); # Set up random forest regression model randomforestreg = PredictMD.singlelabeldataframerandomforestregression( @@ -92,38 +92,38 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(randomforestreg , feature_contrasts) # Train random forest model on training set - PredictMD.fit!(randomforestreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for random forest on training set randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Plot true values versus predicted values for random forest on testing set randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) # Evaluate performance of random forest on training set PredictMD.singlelabelregressionmetrics( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of random forest on testing set PredictMD.singlelabelregressionmetrics( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -135,9 +135,9 @@ end # output by each of regression models. 
# Get real-valued predictions from each model for training set -PredictMD.predict(randomforestreg,trainingfeaturesdf) +PredictMD.predict(randomforestreg,training_features_df) # Get real-valued predictions from each model for testing set -PredictMD.predict(randomforestreg,testingfeaturesdf) +PredictMD.predict(randomforestreg,testing_features_df) diff --git a/examples/boston_housing/boston_housing_svm.ipynb b/examples/boston_housing/boston_housing_svm.ipynb index 213354dce..d3de3f5a2 100644 --- a/examples/boston_housing/boston_housing_svm.ipynb +++ b/examples/boston_housing/boston_housing_svm.ipynb @@ -301,12 +301,12 @@ "labelname = :MedV\n", "\n", "# Put features and labels in separate dataframes\n", - "featuresdf = df[featurenames]\n", - "labelsdf = df[[labelname]]\n", + "features_df = df[featurenames]\n", + "labels_df = df[[labelname]]\n", "\n", "# Display for exploration\n", - "display(DataFrames.head(featuresdf))\n", - "display(DataFrames.head(labelsdf))" + "display(DataFrames.head(features_df))\n", + "display(DataFrames.head(labels_df))" ] }, { @@ -334,7 +334,7 @@ ], "source": [ "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labelsdf[labelname])" + "DataFrames.describe(labels_df[labelname])" ] }, { @@ -344,8 +344,8 @@ "outputs": [], "source": [ "# Split data into training set (70%) and testing set (30%)\n", - "trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =\n", - " PredictMD.split_data(featuresdf,labelsdf;training = 0.7,testing = 0.3,);" + "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", + " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" ] }, { @@ -405,7 +405,7 @@ " # set feature contrasts\n", " PredictMD.set_feature_contrasts!(epsilonsvr_svmreg, contrasts)\n", " # Train epsilon-SVR model on training set\n", - " PredictMD.fit!(epsilonsvr_svmreg,trainingfeaturesdf,traininglabelsdf,)\n", + " PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,)\n", "end" ] }, @@ -950,8 +950,8 @@ "# Plot true values versus predicted values for epsilon-SVR on training set\n", "epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " epsilonsvr_svmreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -1295,8 +1295,8 @@ "# Plot true values versus predicted values for epsilon-SVR on testing set\n", "epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " epsilonsvr_svmreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -1327,8 +1327,8 @@ "# Evaluate performance of epsilon-SVR on training set\n", "PredictMD.singlelabelregressionmetrics(\n", " epsilonsvr_svmreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -1359,8 +1359,8 @@ "# Evaluate performance of epsilon-SVR on testing set\n", "PredictMD.singlelabelregressionmetrics(\n", " epsilonsvr_svmreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -1415,7 +1415,7 @@ " # set feature contrasts\n", " PredictMD.set_feature_contrasts!(nusvr_svmreg, contrasts)\n", " # Train nu-SVR model\n", - " PredictMD.fit!(nusvr_svmreg,trainingfeaturesdf,traininglabelsdf,)\n", + " 
PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,)\n", "end" ] }, @@ -1960,8 +1960,8 @@ "# Plot true values versus predicted values for nu-SVR on training set\n", "nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " nusvr_svmreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -2305,8 +2305,8 @@ "# Plot true values versus predicted values for nu-SVR on testing set\n", "nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", " nusvr_svmreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -2337,8 +2337,8 @@ "# Evaluate performance of nu-SVR on training set\n", "PredictMD.singlelabelregressionmetrics(\n", " nusvr_svmreg,\n", - " trainingfeaturesdf,\n", - " traininglabelsdf,\n", + " training_features_df,\n", + " traininglabels_df,\n", " labelname,\n", " )" ] @@ -2369,8 +2369,8 @@ "# Evaluate performance of nu-SVR on testing set\n", "PredictMD.singlelabelregressionmetrics(\n", " nusvr_svmreg,\n", - " testingfeaturesdf,\n", - " testinglabelsdf,\n", + " testing_features_df,\n", + " testing_labels_df,\n", " labelname,\n", " )" ] @@ -2461,12 +2461,12 @@ "# output by each of regression models.\n", "\n", "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(epsilonsvr_svmreg,trainingfeaturesdf)\n", - "PredictMD.predict(nusvr_svmreg,trainingfeaturesdf)\n", + "PredictMD.predict(epsilonsvr_svmreg,training_features_df)\n", + "PredictMD.predict(nusvr_svmreg,training_features_df)\n", "\n", "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(epsilonsvr_svmreg,testingfeaturesdf)\n", - "PredictMD.predict(nusvr_svmreg,testingfeaturesdf)" + "PredictMD.predict(epsilonsvr_svmreg,testing_features_df)\n", + "PredictMD.predict(nusvr_svmreg,testing_features_df)" ] } ], diff --git a/examples/boston_housing/boston_housing_svm.jl b/examples/boston_housing/boston_housing_svm.jl index b527a3eaf..fcf9b1633 100644 --- a/examples/boston_housing/boston_housing_svm.jl +++ b/examples/boston_housing/boston_housing_svm.jl @@ -65,19 +65,19 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # Display for exploration -display(DataFrames.head(featuresdf)) -display(DataFrames.head(labelsdf)) +display(DataFrames.head(features_df)) +display(DataFrames.head(labels_df)) # View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labelsdf[labelname]) +DataFrames.describe(labels_df[labelname]) # Split data into training set (70%) and testing set (30%) -trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf = - PredictMD.split_data(featuresdf,labelsdf,0.7); +training_features_df,testing_features_df,traininglabels_df,testing_labels_df = + PredictMD.split_data(features_df,labels_df,0.7); # Set up epsilon-SVR model epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression( @@ -96,38 +96,38 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(epsilonsvr_svmreg , feature_contrasts) # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for epsilon-SVR on training set epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Plot true values versus predicted values for epsilon-SVR on testing set epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) # Evaluate performance of epsilon-SVR on training set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of epsilon-SVR on testing set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -148,38 +148,38 @@ else # set feature contrasts PredictMD.set_feature_contrasts!(nusvr_svmreg , feature_contrasts) # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,) end # Plot true values versus predicted values for nu-SVR on training set nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Plot true values versus predicted values for nu-SVR on testing set nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) # Evaluate performance of nu-SVR on training set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, ) # Evaluate performance of nu-SVR on testing set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -192,9 +192,9 @@ end # output by each of regression models. 
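Both calling conventions for `PredictMD.split_data` appear in this patch: the `.jl` scripts pass the first-partition fraction positionally, while the notebook version of the same example uses `training`/`testing` keywords. A minimal side-by-side sketch using the dataframes from this example, shown only to contrast the two forms as they appear in these diffs:

# Positional form, as in boston_housing_svm.jl:
training_features_df, testing_features_df, traininglabels_df, testing_labels_df =
    PredictMD.split_data(features_df, labels_df, 0.7)

# Keyword form, as in boston_housing_svm.ipynb:
training_features_df, testing_features_df, traininglabels_df, testing_labels_df =
    PredictMD.split_data(features_df, labels_df; training = 0.7, testing = 0.3)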
# Get real-valued predictions from each model for training set
-PredictMD.predict(epsilonsvr_svmreg,trainingfeaturesdf)
-PredictMD.predict(nusvr_svmreg,trainingfeaturesdf)
+PredictMD.predict(epsilonsvr_svmreg,training_features_df)
+PredictMD.predict(nusvr_svmreg,training_features_df)
 
 # Get real-valued predictions from each model for testing set
-PredictMD.predict(epsilonsvr_svmreg,testingfeaturesdf)
-PredictMD.predict(nusvr_svmreg,testingfeaturesdf)
+PredictMD.predict(epsilonsvr_svmreg,testing_features_df)
+PredictMD.predict(nusvr_svmreg,testing_features_df)
diff --git a/examples/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy.jl
index 17b463c72..f03784592 100644
--- a/examples/breast_cancer_biopsy.jl
+++ b/examples/breast_cancer_biopsy.jl
@@ -97,12 +97,12 @@ positiveclass = "malignant"
 labellevels = [negativeclass, positiveclass]
 
 # Put features and labels in separate dataframes
-featuresdf = df[featurenames]
-labelsdf = df[[labelname]]
+features_df = df[featurenames]
+labels_df = df[[labelname]]
 
 # Split data into training set (70%) and testing set (30%)
-trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =
-    PredictMD.split_data(featuresdf,labelsdf,0.7)
+training_features_df,testing_features_df,traininglabels_df,testing_labels_df =
+    PredictMD.split_data(features_df,labels_df,0.7)
 
 ##############################################################################
 ##############################################################################
@@ -111,8 +111,8 @@ trainingfeaturesdf,testingfeaturesdf,traininglabelsdf,testinglabelsdf =
 
 # Examine prevalence of each class in training set
-DataFrames.describe(traininglabelsdf[labelname])
-StatsBase.countmap(traininglabelsdf[labelname])
+DataFrames.describe(traininglabels_df[labelname])
+StatsBase.countmap(traininglabels_df[labelname])
 
 # We see that malignant is minority class and benign is majority class.
 # The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending
@@ -122,9 +122,9 @@ StatsBase.countmap(traininglabelsdf[labelname])
 majorityclass = "benign"
 minorityclass = "malignant"
 
-smotedtrainingfeaturesdf, smotedtraininglabelsdf = PredictMD.smote(
-    trainingfeaturesdf,
-    traininglabelsdf,
+smotedtraining_features_df, smotedtraininglabels_df = PredictMD.smote(
+    training_features_df,
+    traininglabels_df,
     featurenames,
     labelname;
     majorityclass = majorityclass,
@@ -135,8 +135,8 @@ smotedtrainingfeaturesdf, smotedtraininglabelsdf = PredictMD.smote(
 
 # Examine prevalence of each class in smoted training set
-DataFrames.describe(smotedtraininglabelsdf[labelname])
-StatsBase.countmap(smotedtraininglabelsdf[labelname])
+DataFrames.describe(smotedtraininglabels_df[labelname])
+StatsBase.countmap(smotedtraininglabels_df[labelname])
 
 # Now we have a ratio of malignant:benign that is 1:1.
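The rebalancing step in this example has more knobs than the truncated hunk shows. A minimal sketch spelling out the full keyword signature that `src/classimbalance/smote.jl` exposes later in this patch; the values for `k`, `pct_over`, and `minority_to_majority_ratio` here are illustrative assumptions, not taken from the diff:

# Sketch only: oversample the minority class toward a 1:1 ratio.
smotedtraining_features_df, smotedtraininglabels_df = PredictMD.smote(
    training_features_df,
    traininglabels_df,
    featurenames,
    labelname;
    majorityclass = "benign",
    minorityclass = "malignant",
    k = 5,                             # assumed neighbor count
    pct_over = 100,                    # assumed oversampling percentage
    minority_to_majority_ratio = 1.0,  # target 1:1, as the comment above states
)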
@@ -168,8 +168,8 @@ else # Train logistic classifier model on smoted training set PredictMD.fit!( logisticclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end @@ -179,8 +179,8 @@ PredictMD.get_underlying(logisticclassifier) # Plot classifier histogram for logistic classifier on smoted training set logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( logisticclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -189,8 +189,8 @@ PredictMD.open(logistic_hist_training) # Plot classifier histogram for logistic classifier on testing set logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -199,8 +199,8 @@ PredictMD.open(logistic_hist_testing) # Evaluate performance of logistic classifier on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -209,8 +209,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of logistic classifier on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -238,8 +238,8 @@ else # Train probit classifier model on smoted training set PredictMD.fit!( probitclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end @@ -249,8 +249,8 @@ PredictMD.get_underlying(probitclassifier) # Plot classifier histogram for probit classifier on smoted training set probitclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( probitclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -259,8 +259,8 @@ PredictMD.open(probitclassifier_hist_training) # Plot classifier histogram for probit classifier on testing set probitclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( probitclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -269,8 +269,8 @@ PredictMD.open(probitclassifier_hist_testing) # Evaluate performance of probit classifier on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( probitclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -279,8 +279,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of probit classifier on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( probitclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -309,16 +309,16 @@ else # Train random forest classifier model on smoted training set PredictMD.fit!( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end # Plot classifier histogram for 
random forest classifier on smoted training set rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -327,8 +327,8 @@ PredictMD.open(rfclassifier_hist_training) # Plot classifier histogram for random forest classifier on testing set rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( rfclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -337,8 +337,8 @@ PredictMD.open(rfclassifier_hist_testing) # Evaluate performance of random forest classifier on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -347,8 +347,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of random forest on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( rfclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -377,16 +377,16 @@ else # Train C-SVC model on smoted training set PredictMD.fit!( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end # Plot classifier histogram for C-SVC on smoted training set csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -395,8 +395,8 @@ PredictMD.open(csvc_svmclassifier_hist_training) # Plot classifier histogram for C-SVC on testing set csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( csvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -405,8 +405,8 @@ PredictMD.open(csvc_svmclassifier_hist_testing) # Evaluate performance of C-SVC on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -415,8 +415,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of C-SVC on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( csvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -445,16 +445,16 @@ else # Train nu-SVC model on smoted training set PredictMD.fit!( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end # Plot classifier histogram for nu-SVC on smoted training set nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -463,8 +463,8 @@ PredictMD.open(nusvc_svmclassifier_hist_training) # Plot classifier histogram for nu-SVC on testing set 
nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( nusvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -473,8 +473,8 @@ PredictMD.open(nusvc_svmclassifier_hist_testing) # Evaluate performance of nu-SVC on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -483,8 +483,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of SVM on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -620,8 +620,8 @@ else # Train multilayer perceptron model on training set PredictMD.fit!( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, ) end @@ -664,8 +664,8 @@ PredictMD.open(knet_learningcurve_lossvsiteration_skip100iterations) # Plot classifier histogram for multilayer perceptron on smoted training set knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, labellevels, ) @@ -674,8 +674,8 @@ PredictMD.open(knetmlpclassifier_hist_training) # Plot classifier histogram for multilayer perceptron on testing set knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( knetmlpclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -684,8 +684,8 @@ PredictMD.open(knetmlpclassifier_hist_testing) # Evaluate performance of multilayer perceptron on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smotedtraining_features_df, + smotedtraininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -694,8 +694,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of multilayer perceptron on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( knetmlpclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -717,8 +717,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, positiveclass; sensitivity = 0.95, @@ -732,8 +732,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, positiveclass; specificity = 0.95, @@ -747,8 +747,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + traininglabels_df, labelname, positiveclass; maximize = :f1score, @@ -762,8 +762,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - trainingfeaturesdf, - 
traininglabelsdf, + training_features_df, + traininglabels_df, labelname, positiveclass; maximize = :cohen_kappa, @@ -779,8 +779,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -794,8 +794,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; specificity = 0.95, @@ -809,8 +809,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; maximize = :f1score, @@ -824,8 +824,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; maximize = :cohen_kappa, @@ -841,8 +841,8 @@ rocplottesting = PredictMD.plotroccurves( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass, ) @@ -858,8 +858,8 @@ prplottesting = PredictMD.plotprcurves( nusvc_svmclassifier, knetmlpclassifier, ], - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass, ) @@ -890,20 +890,20 @@ end # by each of the classification models. # Get probabilities from each model for smoted training set -PredictMD.predict_proba(logisticclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(probitclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(rfclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(csvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(nusvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(knetmlpclassifier,smotedtrainingfeaturesdf,) +PredictMD.predict_proba(logisticclassifier,smotedtraining_features_df,) +PredictMD.predict_proba(probitclassifier,smotedtraining_features_df,) +PredictMD.predict_proba(rfclassifier,smotedtraining_features_df,) +PredictMD.predict_proba(csvc_svmclassifier,smotedtraining_features_df,) +PredictMD.predict_proba(nusvc_svmclassifier,smotedtraining_features_df,) +PredictMD.predict_proba(knetmlpclassifier,smotedtraining_features_df,) # Get probabilities from each model for testing set -PredictMD.predict_proba(logisticclassifier,testingfeaturesdf,) -PredictMD.predict_proba(probitclassifier,testingfeaturesdf,) -PredictMD.predict_proba(rfclassifier,testingfeaturesdf,) -PredictMD.predict_proba(csvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict_proba(nusvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict_proba(knetmlpclassifier,testingfeaturesdf,) +PredictMD.predict_proba(logisticclassifier,testing_features_df,) +PredictMD.predict_proba(probitclassifier,testing_features_df,) +PredictMD.predict_proba(rfclassifier,testing_features_df,) +PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) +PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) # If we want to get predicted classes instead of probabilities, we can use the # PredictMD.predict() function to get the class predictions output by each of the @@ -912,17 +912,17 @@ 
PredictMD.predict_proba(knetmlpclassifier,testingfeaturesdf,) # equivalent to using a threshold of 0.5. # Get class predictions from each model for smoted training set -PredictMD.predict(logisticclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(probitclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(rfclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(csvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(nusvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(knetmlpclassifier,smotedtrainingfeaturesdf,) +PredictMD.predict(logisticclassifier,smotedtraining_features_df,) +PredictMD.predict(probitclassifier,smotedtraining_features_df,) +PredictMD.predict(rfclassifier,smotedtraining_features_df,) +PredictMD.predict(csvc_svmclassifier,smotedtraining_features_df,) +PredictMD.predict(nusvc_svmclassifier,smotedtraining_features_df,) +PredictMD.predict(knetmlpclassifier,smotedtraining_features_df,) # Get class predictions from each model for testing set -PredictMD.predict(logisticclassifier,testingfeaturesdf,) -PredictMD.predict(probitclassifier,testingfeaturesdf,) -PredictMD.predict(rfclassifier,testingfeaturesdf,) -PredictMD.predict(csvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict(nusvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict(knetmlpclassifier,testingfeaturesdf,) +PredictMD.predict(logisticclassifier,testing_features_df,) +PredictMD.predict(probitclassifier,testing_features_df,) +PredictMD.predict(rfclassifier,testing_features_df,) +PredictMD.predict(csvc_svmclassifier,testing_features_df,) +PredictMD.predict(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict(knetmlpclassifier,testing_features_df,) diff --git a/src/classimbalance/smote.jl b/src/classimbalance/smote.jl index f8634c955..41a6020cd 100644 --- a/src/classimbalance/smote.jl +++ b/src/classimbalance/smote.jl @@ -17,8 +17,8 @@ function calculate_smote_pct_under( end function smote( - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, featurenames::AbstractVector{Symbol}, labelname::Symbol; majorityclass::AbstractString = "", @@ -29,8 +29,8 @@ function smote( ) result = smote( Base.GLOBAL_RNG, - featuresdf, - labelsdf, + features_df, + labels_df, featurenames, labelname, majorityclass = majorityclass, @@ -44,8 +44,8 @@ end function smote( rng::AbstractRNG, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, featurenames::AbstractVector{Symbol}, labelname::Symbol; majorityclass::AbstractString = "", @@ -65,13 +65,13 @@ function smote( pct_over = pct_over, minority_to_majority_ratio = minority_to_majority_ratio, ) - if size(featuresdf, 1) != size(labelsdf, 1) - error("size(featuresdf, 1) != size(labelsdf, 1)") + if size(features_df, 1) != size(labels_df, 1) + error("size(features_df, 1) != size(labels_df, 1)") end - if size(featuresdf, 1) == 0 - error("size(featuresdf, 1) == 0") + if size(features_df, 1) == 0 + error("size(features_df, 1) == 0") end - labelsstringarray = labelsdf[labelname] + labelsstringarray = labels_df[labelname] labelsbinaryarray = zeros(Int, length(labelsstringarray)) for i = 1:length(labelsstringarray) # Paul's smote code assumes 1 = minority, 0 = majority @@ -83,8 +83,8 @@ function smote( error("value in labels column is neither majority nor minority") end end - smotedfeaturesdf, smotedlabelsbinaryarray = 
ClassImbalance.smote( - featuresdf[featurenames], + smotedfeatures_df, smotedlabelsbinaryarray = ClassImbalance.smote( + features_df[featurenames], labelsbinaryarray; k = k, pct_over = pct_over, @@ -100,7 +100,7 @@ function smote( error("if you see this error, you will be very sad.") end end - smotedlabelsdf = DataFrames.DataFrame() - smotedlabelsdf[labelname] = smotedlabelsstringarray - return smotedfeaturesdf, smotedlabelsdf + smotedlabels_df = DataFrames.DataFrame() + smotedlabels_df[labelname] = smotedlabelsstringarray + return smotedfeatures_df, smotedlabels_df end diff --git a/src/linearmodel/glm.jl b/src/linearmodel/glm.jl index 8dd877118..b759b51f3 100644 --- a/src/linearmodel/glm.jl +++ b/src/linearmodel/glm.jl @@ -62,15 +62,15 @@ end function fit!( estimator::GLMModel, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, ) - labelsandfeaturesdf = hcat(labelsdf, featuresdf) + labelsandfeatures_df = hcat(labels_df, features_df) info(string("INFO Starting to train GLM.jl model.")) glm = try GLM.glm( estimator.formula, - labelsandfeaturesdf, + labelsandfeatures_df, estimator.family, estimator.link, ) @@ -91,12 +91,12 @@ end function predict( estimator::GLMModel, - featuresdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, ) if estimator.isclassificationmodel && !estimator.isregressionmodel probabilitiesassoc = predict_proba( estimator, - featuresdf, + features_df, ) predictionsvector = singlelabelprobabilitiestopredictions( probabilitiesassoc @@ -107,11 +107,11 @@ function predict( return result elseif !estimator.isclassificationmodel && estimator.isregressionmodel if is_nothing(estimator.underlyingglm) - glmpredictoutput = zeros(size(featuresdf,1)) + glmpredictoutput = zeros(size(features_df,1)) else glmpredictoutput = GLM.predict( estimator.underlyingglm, - featuresdf, + features_df, ) end result = DataFrames.DataFrame() @@ -125,15 +125,15 @@ end function predict_proba( estimator::GLMModel, - featuresdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, ) if estimator.isclassificationmodel && !estimator.isregressionmodel if is_nothing(estimator.underlyingglm,) - glmpredictoutput = zeros(size(featuresdf, 1)) + glmpredictoutput = zeros(size(features_df, 1)) else glmpredictoutput = GLM.predict( estimator.underlyingglm, - featuresdf, + features_df, ) end result = Dict() diff --git a/src/metrics/risk_score_cutoff_values.jl b/src/metrics/risk_score_cutoff_values.jl index 14db8d62a..2074f4799 100644 --- a/src/metrics/risk_score_cutoff_values.jl +++ b/src/metrics/risk_score_cutoff_values.jl @@ -2,8 +2,8 @@ import DataFrames function risk_score_cutoff_values( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString; multiply_by::Real = 1.0, @@ -12,12 +12,12 @@ function risk_score_cutoff_values( # ytrue = Int.( singlelabelbinaryytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], positiveclass, ) ) # - predictedprobabilitiesalllabels = predict_proba(estimator, featuresdf) + predictedprobabilitiesalllabels = predict_proba(estimator, features_df) yscore = Cfloat.( singlelabelbinaryyscore( predictedprobabilitiesalllabels[singlelabelname], diff --git a/src/metrics/singlelabelbinaryclassificationmetrics.jl 
b/src/metrics/singlelabelbinaryclassificationmetrics.jl
index 247e5534e..bb3fefe5e 100644
--- a/src/metrics/singlelabelbinaryclassificationmetrics.jl
+++ b/src/metrics/singlelabelbinaryclassificationmetrics.jl
@@ -104,8 +104,8 @@ end
 
 function _singlelabelbinaryclassclassificationmetrics(
     estimator::Fittable,
-    featuresdf::DataFrames.AbstractDataFrame,
-    labelsdf::DataFrames.AbstractDataFrame,
+    features_df::DataFrames.AbstractDataFrame,
+    labels_df::DataFrames.AbstractDataFrame,
     singlelabelname::Symbol,
     positiveclass::AbstractString;
     kwargs...
@@ -116,7 +116,7 @@ function _singlelabelbinaryclassclassificationmetrics(
     selectedtunableparam, selectedparamtomax, metricprintnames =
         _singlelabelbinaryclassclassificationmetrics_tunableparam(kwargsdict)
     #
-    predictedprobabilitiesalllabels = predict_proba(estimator, featuresdf)
+    predictedprobabilitiesalllabels = predict_proba(estimator, features_df)
     yscore = Cfloat.(
         singlelabelbinaryyscore(
             predictedprobabilitiesalllabels[singlelabelname],
@@ -125,7 +125,7 @@ function _singlelabelbinaryclassclassificationmetrics(
         )
     ytrue = Int.(
         singlelabelbinaryytrue(
-            labelsdf[singlelabelname],
+            labels_df[singlelabelname],
             positiveclass,
         )
     )
@@ -191,8 +191,8 @@ end
 
 function singlelabelbinaryclassclassificationmetrics(
     estimator::Fittable,
-    featuresdf::DataFrames.AbstractDataFrame,
-    labelsdf::DataFrames.AbstractDataFrame,
+    features_df::DataFrames.AbstractDataFrame,
+    labels_df::DataFrames.AbstractDataFrame,
     singlelabelname::Symbol,
     positiveclass::AbstractString;
     kwargs...
@@ -200,8 +200,8 @@ function singlelabelbinaryclassclassificationmetrics(
     vectorofestimators = Fittable[estimator]
     result = singlelabelbinaryclassclassificationmetrics(
         vectorofestimators,
-        featuresdf,
-        labelsdf,
+        features_df,
+        labels_df,
         singlelabelname,
         positiveclass;
         kwargs...
@@ -211,8 +211,8 @@ end
 
 function singlelabelbinaryclassclassificationmetrics(
     vectorofestimators::AbstractVector{Fittable},
-    featuresdf::DataFrames.AbstractDataFrame,
-    labelsdf::DataFrames.AbstractDataFrame,
+    features_df::DataFrames.AbstractDataFrame,
+    labels_df::DataFrames.AbstractDataFrame,
     singlelabelname::Symbol,
     positiveclass::AbstractString;
     kwargs...
@@ -224,8 +224,8 @@ function singlelabelbinaryclassclassificationmetrics(
     metricsforeachestimator = [
         _singlelabelbinaryclassclassificationmetrics(
             est,
-            featuresdf,
-            labelsdf,
+            features_df,
+            labels_df,
             singlelabelname,
             positiveclass;
             kwargs...
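For reference, the metrics functions above all reduce to the same two vectors. A short recap mirroring the diff, with one extra line: the explicit 0.5 cutoff is an assumption based on the "equivalent to using a threshold of 0.5" comment in the breast-cancer example, not code from this patch:

# ytrue/yscore construction as in _singlelabelbinaryclassclassificationmetrics:
predictedprobabilitiesalllabels = predict_proba(estimator, features_df)
yscore = Cfloat.(
    singlelabelbinaryyscore(
        predictedprobabilitiesalllabels[singlelabelname],
        positiveclass,
    )
)
ytrue = Int.(
    singlelabelbinaryytrue(
        labels_df[singlelabelname],
        positiveclass,
    )
)
# Illustrative thresholding (assumption; see note above):
ypred = ifelse.(yscore .>= 0.5, positiveclass, negativeclass)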
diff --git a/src/metrics/singlelabelregressionmetrics.jl b/src/metrics/singlelabelregressionmetrics.jl index 4868406af..2dc0ed6aa 100644 --- a/src/metrics/singlelabelregressionmetrics.jl +++ b/src/metrics/singlelabelregressionmetrics.jl @@ -26,14 +26,14 @@ end function _singlelabelregressionmetrics( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, ) ytrue = singlelabelregressionytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], ) - predictionsalllabels = predict(estimator, featuresdf) + predictionsalllabels = predict(estimator, features_df) ypred = singlelabelregressionypred( predictionsalllabels[singlelabelname], ) @@ -56,15 +56,15 @@ end function singlelabelregressionmetrics( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, ) vectorofestimators = Fittable[estimator] result = singlelabelregressionmetrics( vectorofestimators, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, ) return result @@ -72,16 +72,16 @@ end function singlelabelregressionmetrics( vectorofestimators::AbstractVector{Fittable}, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol; kwargs... ) metricsforeachestimator = [ _singlelabelregressionmetrics( est, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, ) for est in vectorofestimators diff --git a/src/modelselection/split_data.jl b/src/modelselection/split_data.jl index 37b08d75f..82b5ebe7d 100644 --- a/src/modelselection/split_data.jl +++ b/src/modelselection/split_data.jl @@ -2,14 +2,14 @@ import DataFrames import StatsBase function split_data( - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, split::Real, ) result = split_data( Base.GLOBAL_RNG, - featuresdf, - labelsdf, + features_df, + labels_df, split, ) return result @@ -17,19 +17,22 @@ end function split_data( rng::AbstractRNG, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, split::Real, ) + # if !(0 < split < 1) error("split must be >0 and <1") end - if size(featuresdf, 1) != size(labelsdf, 1) - error("featuresdf and labelsdf do not have the same number of rows") + if size(features_df, 1) != size(labels_df, 1) + error("features_df and labels_df do not have the same number of rows") end - num_rows = size(featuresdf, 1) + # + num_rows = size(features_df, 1) num_partition_1 = round(Int, split * num_rows) num_partition_2 = num_rows - num_partition_1 + # allrows = convert(Array, 1:num_rows) partition_1_rows = StatsBase.sample( rng, @@ -38,21 +41,15 @@ function split_data( replace = false, ) partition_2_rows = setdiff(allrows, partition_1_rows) - @assert(typeof(partition_1_rows) <: AbstractVector) - @assert(typeof(partition_2_rows) <: AbstractVector) - @assert(length(partition_1_rows) == num_partition_1) - @assert(length(partition_2_rows) == num_partition_2) - @assert( - all( - allrows .== sort(vcat(partition_1_rows, partition_2_rows)) - 
) - ) - partition_1_features_df = featuresdf[partition_1_rows, :] - partition_2_features_df = featuresdf[partition_2_rows, :] - partition_1_labels_df = labelsdf[partition_1_rows, :] - partition_2_labels_df = labelsdf[partition_2_rows, :] + # + partition_1_features_df = features_df[partition_1_rows, :] + partition_2_features_df = features_df[partition_2_rows, :] + # + partition_1_labels_df = labels_df[partition_1_rows, :] + partition_2_labels_df = labels_df[partition_2_rows, :] + # return partition_1_features_df, - partition_2_features_df, partition_1_labels_df, + partition_2_features_df, partition_2_labels_df end diff --git a/src/plotting/plotprcurve.jl b/src/plotting/plotprcurve.jl index 3620cbdaf..4e862b94d 100644 --- a/src/plotting/plotprcurve.jl +++ b/src/plotting/plotprcurve.jl @@ -4,16 +4,16 @@ import PGFPlotsX function plotprcurve( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString, ) vectorofestimators = [estimator] result = plotprcurve( vectorofestimators, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass, ) @@ -22,8 +22,8 @@ end function plotprcurve( vectorofestimators::AbstractVector{Fittable}, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString, ) @@ -35,8 +35,8 @@ function plotprcurve( estimator_i = vectorofestimators[i] metrics_i = _singlelabelbinaryclassclassificationmetrics( estimator_i, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass; threshold = 0.5, diff --git a/src/plotting/plotroccurve.jl b/src/plotting/plotroccurve.jl index 0b88b371a..0622da28c 100644 --- a/src/plotting/plotroccurve.jl +++ b/src/plotting/plotroccurve.jl @@ -4,16 +4,16 @@ import PGFPlotsX function plotroccurve( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString, ) vectorofestimators = [estimator] result = plotroccurve( vectorofestimators, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass, ) @@ -22,8 +22,8 @@ end function plotroccurve( vectorofestimators::AbstractVector{Fittable}, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString, ) @@ -35,8 +35,8 @@ function plotroccurve( estimator_i = vectorofestimators[i] metrics_i = _singlelabelbinaryclassclassificationmetrics( estimator_i, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass; threshold = 0.5, diff --git a/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl b/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl index 8601f688c..f8798efd7 100644 --- a/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl +++ b/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl @@ -4,8 +4,8 @@ import PGFPlotsX function plotsinglelabelbinaryclassclassifierhistogram( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - 
labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, singlelabellevels::AbstractVector{<:AbstractString}; numbins::Integer = 25, @@ -18,7 +18,7 @@ function plotsinglelabelbinaryclassclassifierhistogram( end negativeclass = singlelabellevels[1] positiveclass = singlelabellevels[2] - predictedprobabilitiesalllabels = predict_proba(estimator, featuresdf) + predictedprobabilitiesalllabels = predict_proba(estimator, features_df) yscore = Cfloat.( singlelabelbinaryyscore( predictedprobabilitiesalllabels[singlelabelname], @@ -27,7 +27,7 @@ function plotsinglelabelbinaryclassclassifierhistogram( ) ytrue = Int.( singlelabelbinaryytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], positiveclass, ) ) diff --git a/src/plotting/plotsinglelabelregressiontruevspredicted.jl b/src/plotting/plotsinglelabelregressiontruevspredicted.jl index 61b7a1a08..55e86e1f7 100644 --- a/src/plotting/plotsinglelabelregressiontruevspredicted.jl +++ b/src/plotting/plotsinglelabelregressiontruevspredicted.jl @@ -4,15 +4,15 @@ import PGFPlotsX function plotsinglelabelregressiontrueversuspredicted( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol; includeorigin::Bool = false, ) ytrue = singlelabelregressionytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], ) - predictionsalllabels = predict(estimator, featuresdf) + predictionsalllabels = predict(estimator, features_df) ypred = singlelabelregressionypred( predictionsalllabels[singlelabelname], ) diff --git a/src/plotting/probability_calibration_plots.jl b/src/plotting/probability_calibration_plots.jl index 765dccd7d..87f6dd8a3 100644 --- a/src/plotting/probability_calibration_plots.jl +++ b/src/plotting/probability_calibration_plots.jl @@ -4,8 +4,8 @@ import PGFPlotsX function probability_calibration_scores_and_fractions( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString; window::Real = 0.1, @@ -13,11 +13,11 @@ function probability_calibration_scores_and_fractions( ) ytrue = Int.( singlelabelbinaryytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], positiveclass, ) ) - predictedprobabilitiesalllabels = predict_proba(estimator, featuresdf) + predictedprobabilitiesalllabels = predict_proba(estimator, features_df) yscore = Cfloat.( singlelabelbinaryyscore( predictedprobabilitiesalllabels[singlelabelname], @@ -71,8 +71,8 @@ end function plot_probability_calibration_curve( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString; window::Real = 0.1, @@ -80,8 +80,8 @@ function plot_probability_calibration_curve( ) scores, fractions = probability_calibration_scores_and_fractions( estimator, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass; window = window, @@ -152,8 +152,8 @@ end function probability_calibration_metrics( estimator::Fittable, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + 
features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString; window::Real = 0.1, @@ -162,8 +162,8 @@ function probability_calibration_metrics( vectorofestimators = Fittable[estimator] result = probability_calibration_metrics( vectorofestimators, - featuresdf, - labelsdf, + features_df, + labels_df, singlelabelname, positiveclass, ) @@ -172,8 +172,8 @@ end function probability_calibration_metrics( vectorofestimators::AbstractVector{Fittable}, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame, singlelabelname::Symbol, positiveclass::AbstractString, window::Real = 0.1, @@ -189,13 +189,13 @@ function probability_calibration_metrics( for i = 1:length(vectorofestimators) ytrue = Int.( singlelabelbinaryytrue( - labelsdf[singlelabelname], + labels_df[singlelabelname], positiveclass, ) ) predictedprobabilitiesalllabels = predict_proba( vectorofestimators[i], - featuresdf, + features_df, ) yscore = Cfloat.( singlelabelbinaryyscore( diff --git a/src/preprocessing/dataframetodecisiontree.jl b/src/preprocessing/dataframetodecisiontree.jl index ee7213e7a..b5d32403c 100644 --- a/src/preprocessing/dataframetodecisiontree.jl +++ b/src/preprocessing/dataframetodecisiontree.jl @@ -48,12 +48,12 @@ end function transform( transformer::MutableDataFrame2DecisionTreeTransformer, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame; kwargs... ) singlelabelname = transformer.singlelabelname - labelsarray = convert(Array, labelsdf[singlelabelname]) + labelsarray = convert(Array, labels_df[singlelabelname]) modelformula = generate_formula( transformer.featurenames[1], transformer.featurenames; @@ -61,7 +61,7 @@ function transform( ) modelframe = StatsModels.ModelFrame( modelformula, - featuresdf; + features_df; contrasts = transformer.dffeaturecontrasts.contrasts, ) modelmatrix = StatsModels.ModelMatrix(modelframe) @@ -71,7 +71,7 @@ end function transform( transformer::MutableDataFrame2DecisionTreeTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) modelformula = generate_formula( @@ -81,7 +81,7 @@ function transform( ) modelframe = StatsModels.ModelFrame( modelformula, - featuresdf; + features_df; contrasts = transformer.dffeaturecontrasts.contrasts, ) modelmatrix = StatsModels.ModelMatrix(modelframe) @@ -91,25 +91,25 @@ end function fit!( transformer::MutableDataFrame2DecisionTreeTransformer, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf, labelsdf) + return transform(transformer, features_df, labels_df) end function predict( transformer::MutableDataFrame2DecisionTreeTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end function predict_proba( transformer::MutableDataFrame2DecisionTreeTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... 
) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end diff --git a/src/preprocessing/dataframetoglm.jl b/src/preprocessing/dataframetoglm.jl index e10aac7a6..7f9a86af0 100644 --- a/src/preprocessing/dataframetoglm.jl +++ b/src/preprocessing/dataframetoglm.jl @@ -31,49 +31,49 @@ end function transform( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame; kwargs... ) - transformedlabelsdf = DataFrames.DataFrame() + transformedlabels_df = DataFrames.DataFrame() label = transformer.label positiveclass = transformer.positiveclass - originallabelcolumn = labelsdf[label] + originallabelcolumn = labels_df[label] transformedlabelcolumn = Int.(originallabelcolumn .== positiveclass) - transformedlabelsdf[label] = transformedlabelcolumn - return featuresdf, transformedlabelsdf + transformedlabels_df[label] = transformedlabelcolumn + return features_df, transformedlabels_df end function transform( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return featuresdf + return features_df end function fit!( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, - featuresdf::DataFrames.AbstractDataFrame, - labelsdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame, + labels_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf, labelsdf) + return transform(transformer, features_df, labels_df) end function predict( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end function predict_proba( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end diff --git a/src/preprocessing/dataframetoknet.jl b/src/preprocessing/dataframetoknet.jl index 409f28a8c..503a5112b 100644 --- a/src/preprocessing/dataframetoknet.jl +++ b/src/preprocessing/dataframetoknet.jl @@ -143,28 +143,28 @@ end function predict( transformer::MutableDataFrame2ClassificationKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end function predict_proba( transformer::MutableDataFrame2ClassificationKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) - return transform(transformer, featuresdf) + return transform(transformer, features_df) end function predict( transformer::MutableDataFrame2RegressionKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) result = transform( transformer, - featuresdf; + features_df; kwargs... 
) return result @@ -172,12 +172,12 @@ end function predict_proba( transformer::MutableDataFrame2RegressionKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) result = transform( transformer, - featuresdf; + features_df; kwargs... ) return result @@ -227,7 +227,7 @@ function transform( transformer.index, ) training_labels_array[:, j] = - [labelstring2intmap_j[y] for y in labelsdf[label_j]] + [labelstring2intmap_j[y] for y in labels_df[label_j]] end end modelformula = generate_formula( @@ -269,7 +269,7 @@ end function transform( transformer::MutableDataFrame2ClassificationKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame; + features_df::DataFrames.AbstractDataFrame; kwargs... ) modelformula = generate_formula( @@ -279,7 +279,7 @@ function transform( ) modelframe = StatsModels.ModelFrame( modelformula, - featuresdf; + features_df; contrasts = transformer.dffeaturecontrasts.contrasts, ) modelmatrix = StatsModels.ModelMatrix(modelframe) @@ -354,7 +354,7 @@ end function transform( transformer::MutableDataFrame2RegressionKnetTransformer, - featuresdf::DataFrames.AbstractDataFrame, + features_df::DataFrames.AbstractDataFrame, kwargs... ) modelformula = generate_formula( @@ -364,7 +364,7 @@ function transform( ) modelframe = StatsModels.ModelFrame( modelformula, - featuresdf; + features_df; contrasts = transformer.dffeaturecontrasts.contrasts, ) modelmatrix = StatsModels.ModelMatrix(modelframe) diff --git a/test/cpu/functional/bostonhousing/run_bostonhousing.jl b/test/cpu/functional/bostonhousing/run_bostonhousing.jl index f2e6b8f72..8f212ff5d 100644 --- a/test/cpu/functional/bostonhousing/run_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/run_bostonhousing.jl @@ -69,31 +69,28 @@ end labelname = :MedV # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] +features_df = df[featurenames] +labels_df = df[[labelname]] # View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labelsdf[labelname]) - -# Split data into training (50% of total) and non-training (50% of total) -trainingfeaturesdf, - nontrainingfeaturesdf, - traininglabelsdf, - nontraininglabelsdf = PredictMD.split_data( - featuresdf, - labelsdf, - 0.5, +DataFrames.describe(labels_df[labelname]) + +# Split the data into training (50%), validation (25%), and testing (25%) +trainingandvalidation_features_df, + trainingandvalidation_labels_df, + testing_features_df, + testing_labels_df = PredictMD.split_data( + features_df, + labels_df, + 0.75, # 75% training+validation, 25% testing ) - -# Split non-training data into validation (25% of total) and # testing (25% -# of total) -validationfeaturesdf, - testingfeaturesdf, - validationlabelsdf, - testinglabelsdf = PredictMD.split_data( - nontrainingfeaturesdf, - nontraininglabelsdf, - 0.5, +training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df = PredictMD.split_data( + trainingandvalidation_features_df, + trainingandvalidation_labels_df, + 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation ) ############################################################################## @@ -123,7 +120,7 @@ end if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" else # Train linear regression model - PredictMD.fit!(linearreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(linearreg,training_features_df,training_labels_df,) end # View coefficients, p values, etc. 
for underlying linear regression @@ -132,8 +129,8 @@ PredictMD.get_underlying(linearreg) # Plot true values versus predicted values for linear regression on training set linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) PredictMD.open_plot(linearreg_plot_training) @@ -141,8 +138,8 @@ PredictMD.open_plot(linearreg_plot_training) # Plot true values versus predicted values for linear regression on testing set linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname ) PredictMD.open_plot(linearreg_plot_testing) @@ -150,16 +147,16 @@ PredictMD.open_plot(linearreg_plot_testing) # Evaluate performance of linear regression on training set PredictMD.singlelabelregressionmetrics( linearreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) # Evaluate performance of linear regression on testing set PredictMD.singlelabelregressionmetrics( linearreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -185,14 +182,14 @@ end if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" else # Train random forest model on training set - PredictMD.fit!(randomforestreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) end # Plot true values versus predicted values for random forest on training set randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) PredictMD.open_plot(randomforestreg_plot_training) @@ -200,8 +197,8 @@ PredictMD.open_plot(randomforestreg_plot_training) # Plot true values versus predicted values for random forest on testing set randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open_plot(randomforestreg_plot_testing) @@ -209,16 +206,16 @@ PredictMD.open_plot(randomforestreg_plot_testing) # Evaluate performance of random forest on training set PredictMD.singlelabelregressionmetrics( randomforestreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) # Evaluate performance of random forest on testing set PredictMD.singlelabelregressionmetrics( randomforestreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -245,14 +242,14 @@ end if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" else # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(epsilonsvr_svmreg,training_features_df,training_labels_df,) end # Plot true values versus predicted values for epsilon-SVR on training set epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) PredictMD.open_plot(epsilonsvr_svmreg_plot_training) @@ -260,8 +257,8 @@ PredictMD.open_plot(epsilonsvr_svmreg_plot_training) # Plot true values versus predicted values for epsilon-SVR on testing set 
epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open_plot(epsilonsvr_svmreg_plot_testing) @@ -269,16 +266,16 @@ PredictMD.open_plot(epsilonsvr_svmreg_plot_testing) # Evaluate performance of epsilon-SVR on training set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) # Evaluate performance of epsilon-SVR on testing set PredictMD.singlelabelregressionmetrics( epsilonsvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -305,14 +302,14 @@ end if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" else # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,trainingfeaturesdf,traininglabelsdf,) + PredictMD.fit!(nusvr_svmreg,training_features_df,training_labels_df,) end # Plot true values versus predicted values for nu-SVR on training set nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) PredictMD.open_plot(nusvr_svmreg_plot_training) @@ -320,8 +317,8 @@ PredictMD.open_plot(nusvr_svmreg_plot_training) # Plot true values versus predicted values for nu-SVR on testing set nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open_plot(nusvr_svmreg_plot_testing) @@ -329,16 +326,16 @@ PredictMD.open_plot(nusvr_svmreg_plot_testing) # Evaluate performance of nu-SVR on training set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) # Evaluate performance of nu-SVR on testing set PredictMD.singlelabelregressionmetrics( nusvr_svmreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -446,10 +443,10 @@ else # Train multilayer perceptron model on training set PredictMD.fit!( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, - validationfeaturesdf, - validationlabelsdf, + training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df, ) end @@ -492,8 +489,8 @@ PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) # Plot true values versus predicted values for multilayer perceptron on training set knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, ) PredictMD.open_plot(knetmlpreg_plot_training) @@ -501,8 +498,8 @@ PredictMD.open_plot(knetmlpreg_plot_training) # Plot true values versus predicted values for multilayer perceptron on testing set knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) PredictMD.open_plot(knetmlpreg_plot_testing) @@ -510,16 +507,16 @@ PredictMD.open_plot(knetmlpreg_plot_testing) # Evaluate performance of multilayer perceptron on training set PredictMD.singlelabelregressionmetrics( knetmlpreg, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, 
labelname, ) # Evaluate performance of multilayer perceptron on testing set PredictMD.singlelabelregressionmetrics( knetmlpreg, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, ) @@ -540,16 +537,16 @@ all_models = PredictMD.Fittable[ # Compare performance of all five models on training set showall(PredictMD.singlelabelregressionmetrics( all_models, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, )) # Compare performance of all models on testing set showall(PredictMD.singlelabelregressionmetrics( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, )) @@ -577,15 +574,15 @@ end # output by each of regression models. # Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,trainingfeaturesdf,) -PredictMD.predict(randomforestreg,trainingfeaturesdf,) -PredictMD.predict(epsilonsvr_svmreg,trainingfeaturesdf,) -PredictMD.predict(nusvr_svmreg,trainingfeaturesdf,) -PredictMD.predict(knetmlpreg,trainingfeaturesdf,) +PredictMD.predict(linearreg,training_features_df,) +PredictMD.predict(randomforestreg,training_features_df,) +PredictMD.predict(epsilonsvr_svmreg,training_features_df,) +PredictMD.predict(nusvr_svmreg,training_features_df,) +PredictMD.predict(knetmlpreg,training_features_df,) # Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testingfeaturesdf,) -PredictMD.predict(randomforestreg,testingfeaturesdf,) -PredictMD.predict(epsilonsvr_svmreg,testingfeaturesdf,) -PredictMD.predict(nusvr_svmreg,testingfeaturesdf,) -PredictMD.predict(knetmlpreg,testingfeaturesdf,) +PredictMD.predict(linearreg,testing_features_df,) +PredictMD.predict(randomforestreg,testing_features_df,) +PredictMD.predict(epsilonsvr_svmreg,testing_features_df,) +PredictMD.predict(nusvr_svmreg,testing_features_df,) +PredictMD.predict(knetmlpreg,testing_features_df,) diff --git a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl index b44d75554..5cd202640 100644 --- a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl +++ b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl @@ -64,28 +64,25 @@ positiveclass = "malignant" labellevels = [negativeclass, positiveclass] # Put features and labels in separate dataframes -featuresdf = df[featurenames] -labelsdf = df[[labelname]] - -# Split data into training (50% of total) and non-training (50% of total) -trainingfeaturesdf, - nontrainingfeaturesdf, - traininglabelsdf, - nontraininglabelsdf = PredictMD.split_data( - featuresdf, - labelsdf, - 0.5, +features_df = df[featurenames] +labels_df = df[[labelname]] + +# Split the data into training (50%), validation (25%), and testing (25%) +trainingandvalidation_features_df, + trainingandvalidation_labels_df, + testing_features_df, + testing_labels_df = PredictMD.split_data( + features_df, + labels_df, + 0.75, # 75% training+validation, 25% testing ) - -# Split non-training data into validation (25% of total) and testing (25% -# of total) -validationfeaturesdf, - testingfeaturesdf, - validationlabelsdf, - testinglabelsdf = PredictMD.split_data( - nontrainingfeaturesdf, - nontraininglabelsdf, - 0.5, +training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df = PredictMD.split_data( + trainingandvalidation_features_df, + trainingandvalidation_labels_df, + 2/3, # 2/3 of 75% = 50% 
training, 1/3 of 75% = 25% validation ) ############################################################################## @@ -95,8 +92,8 @@ validationfeaturesdf, ############################################################################## # Examine prevalence of each class in training set -# DataFrames.describe(traininglabelsdf[labelname]) -StatsBase.countmap(traininglabelsdf[labelname]) +# DataFrames.describe(training_labels_df[labelname]) +StatsBase.countmap(training_labels_df[labelname]) # We see that malignant is minority class and benign is majority class. # The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending @@ -106,9 +103,9 @@ StatsBase.countmap(traininglabelsdf[labelname]) majorityclass = "benign" minorityclass = "malignant" -smotedtrainingfeaturesdf, smotedtraininglabelsdf = PredictMD.smote( - trainingfeaturesdf, - traininglabelsdf, +smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( + training_features_df, + training_labels_df, featurenames, labelname; majorityclass = majorityclass, @@ -119,8 +116,8 @@ smotedtrainingfeaturesdf, smotedtraininglabelsdf = PredictMD.smote( ) # Examine prevalence of each class in smoted training set -# DataFrames.describe(smotedtraininglabelsdf[labelname]) -StatsBase.countmap(smotedtraininglabelsdf[labelname]) +# DataFrames.describe(smoted_training_labels_df[labelname]) +StatsBase.countmap(smoted_training_labels_df[labelname]) # Now we have a ratio of malignant:benign that is 1:1. @@ -154,8 +151,8 @@ else # Train logistic classifier model on smoted training set PredictMD.fit!( logisticclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, ) end @@ -165,8 +162,8 @@ PredictMD.get_underlying(logisticclassifier) # Plot classifier histogram for logistic classifier on smoted training set logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( logisticclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, labellevels, ) @@ -175,8 +172,8 @@ PredictMD.open_plot(logistic_hist_training) # Plot classifier histogram for logistic classifier on testing set logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -185,8 +182,8 @@ PredictMD.open_plot(logistic_hist_testing) # Evaluate performance of logistic classifier on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -195,8 +192,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of logistic classifier on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -204,8 +201,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( logisticclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, positiveclass; window = 0.2, @@ -214,8 +211,8 @@ PredictMD.open_plot(logistic_calibration_curve) 
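# Note on the calibration curve above: conceptually, it bins the predicted
# probabilities and, within each bin, compares the mean predicted risk to the
# observed fraction of positive cases. The following is a minimal illustrative
# sketch of that per-bin computation, not the PredictMD implementation;
# `scores` and `istrue` are hypothetical stand-ins for the vector of predicted
# probabilities and the true binary (0/1) labels.
function calibration_bin(scores, istrue, lo, hi)
    inbin = (scores .>= lo) .& (scores .< hi)  # scores falling in [lo, hi)
    # mean predicted probability vs. observed event rate in this bin;
    # a well-calibrated model gives points near the diagonal
    return (mean(scores[inbin]), mean(istrue[inbin]))
end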
PredictMD.probability_calibration_metrics( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; window = 0.1, @@ -223,8 +220,8 @@ PredictMD.probability_calibration_metrics( logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; average_function = mean, @@ -239,8 +236,8 @@ println( showall(logistic_risk_group_prevalences) logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( logisticclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; average_function = median, @@ -279,16 +276,16 @@ else # Train random forest classifier model on smoted training set PredictMD.fit!( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, ) end # Plot classifier histogram for random forest classifier on smoted training set rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, labellevels, ) @@ -297,8 +294,8 @@ PredictMD.open_plot(rfclassifier_hist_training) # Plot classifier histogram for random forest classifier on testing set rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( rfclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -307,8 +304,8 @@ PredictMD.open_plot(rfclassifier_hist_testing) # Evaluate performance of random forest classifier on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( rfclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -317,8 +314,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of random forest on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( rfclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -326,8 +323,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( rf_calibration_curve = PredictMD.plot_probability_calibration_curve( rfclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; window = 0.1, @@ -359,16 +356,16 @@ else # Train C-SVC model on smoted training set PredictMD.fit!( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, ) end # Plot classifier histogram for C-SVC on smoted training set csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, labellevels, ) @@ -377,8 +374,8 @@ PredictMD.open_plot(csvc_svmclassifier_hist_training) # Plot classifier histogram for C-SVC on testing set csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( csvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + 
testing_labels_df, labelname, labellevels, ) @@ -387,8 +384,8 @@ PredictMD.open_plot(csvc_svmclassifier_hist_testing) # Evaluate performance of C-SVC on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( csvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -397,8 +394,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of C-SVC on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( csvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -429,16 +426,16 @@ else # Train nu-SVC model on smoted training set PredictMD.fit!( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, ) end # Plot classifier histogram for nu-SVC on smoted training set nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, labellevels, ) @@ -447,8 +444,8 @@ PredictMD.open_plot(nusvc_svmclassifier_hist_training) # Plot classifier histogram for nu-SVC on testing set nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( nusvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -457,8 +454,8 @@ PredictMD.open_plot(nusvc_svmclassifier_hist_testing) # Evaluate performance of nu-SVC on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -467,8 +464,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of SVM on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( nusvc_svmclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -599,10 +596,10 @@ else # Train multilayer perceptron model on training set PredictMD.fit!( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, - validationfeaturesdf, - validationlabelsdf, + smoted_training_features_df, + smoted_training_labels_df, + validation_features_df, + validation_labels_df, ) end @@ -645,8 +642,8 @@ PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) # Plot classifier histogram for multilayer perceptron on smoted training set knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, labellevels, ) @@ -655,8 +652,8 @@ PredictMD.open_plot(knetmlpclassifier_hist_training) # Plot classifier histogram for multilayer perceptron on testing set knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( knetmlpclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, labellevels, ) @@ -665,8 +662,8 @@ PredictMD.open_plot(knetmlpclassifier_hist_testing) # Evaluate performance of multilayer perceptron 
on smoted training set PredictMD.singlelabelbinaryclassclassificationmetrics( knetmlpclassifier, - smotedtrainingfeaturesdf, - smotedtraininglabelsdf, + smoted_training_features_df, + smoted_training_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -675,8 +672,8 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( # Evaluate performance of multilayer perceptron on testing set PredictMD.singlelabelbinaryclassclassificationmetrics( knetmlpclassifier, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, @@ -699,32 +696,32 @@ all_models = PredictMD.Fittable[ # Compare performance of all models on smoted training set showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, positiveclass; sensitivity = 0.95, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, positiveclass; specificity = 0.95, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, positiveclass; maximize = :f1score, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - trainingfeaturesdf, - traininglabelsdf, + training_features_df, + training_labels_df, labelname, positiveclass; maximize = :cohen_kappa, @@ -733,32 +730,32 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( # Compare performance of all models on testing set showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; sensitivity = 0.95, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; specificity = 0.95, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; maximize = :f1score, )) showall(PredictMD.singlelabelbinaryclassclassificationmetrics( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass; maximize = :cohen_kappa, @@ -767,8 +764,8 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( # Plot receiver operating characteristic curves for all models on testing set. rocplottesting = PredictMD.plotroccurves( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass, ) @@ -777,8 +774,8 @@ PredictMD.open_plot(rocplottesting) # Plot precision-recall curves for all models on testing set. prplottesting = PredictMD.plotprcurves( all_models, - testingfeaturesdf, - testinglabelsdf, + testing_features_df, + testing_labels_df, labelname, positiveclass, ) @@ -808,18 +805,18 @@ end # by each of the classification models. 
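# As noted in the comments below, PredictMD.predict() for binary classifiers
# applies a default probability threshold of 0.5. To use a different operating
# point (for example, a cutoff chosen with PredictMD.risk_score_cutoff_values
# above), the probabilities returned by PredictMD.predict_proba() can be
# thresholded by hand. A minimal sketch, where `prob_positive` is a
# hypothetical vector holding each row's predicted probability of the positive
# class (the exact container returned by predict_proba() is not shown here):
threshold = 0.5  # replace with a custom cutoff if desired
predicted_classes = ifelse.(prob_positive .> threshold,
                            positiveclass, negativeclass)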
# Get probabilities from each model for smoted training set -PredictMD.predict_proba(logisticclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(rfclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(csvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(nusvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict_proba(knetmlpclassifier,smotedtrainingfeaturesdf,) +PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,) +PredictMD.predict_proba(rfclassifier,smoted_training_features_df,) +PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,) # Get probabilities from each model for testing set -PredictMD.predict_proba(logisticclassifier,testingfeaturesdf,) -PredictMD.predict_proba(rfclassifier,testingfeaturesdf,) -PredictMD.predict_proba(csvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict_proba(nusvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict_proba(knetmlpclassifier,testingfeaturesdf,) +PredictMD.predict_proba(logisticclassifier,testing_features_df,) +PredictMD.predict_proba(rfclassifier,testing_features_df,) +PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) +PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) # If we want to get predicted classes instead of probabilities, we can use the # PredictMD.predict() function to get the class predictions output by each of the @@ -828,15 +825,15 @@ PredictMD.predict_proba(knetmlpclassifier,testingfeaturesdf,) # equivalent to using a threshold of 0.5. # Get class predictions from each model for smoted training set -PredictMD.predict(logisticclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(rfclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(csvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(nusvc_svmclassifier,smotedtrainingfeaturesdf,) -PredictMD.predict(knetmlpclassifier,smotedtrainingfeaturesdf,) +PredictMD.predict(logisticclassifier,smoted_training_features_df,) +PredictMD.predict(rfclassifier,smoted_training_features_df,) +PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) # Get class predictions from each model for testing set -PredictMD.predict(logisticclassifier,testingfeaturesdf,) -PredictMD.predict(rfclassifier,testingfeaturesdf,) -PredictMD.predict(csvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict(nusvc_svmclassifier,testingfeaturesdf,) -PredictMD.predict(knetmlpclassifier,testingfeaturesdf,) +PredictMD.predict(logisticclassifier,testing_features_df,) +PredictMD.predict(rfclassifier,testing_features_df,) +PredictMD.predict(csvc_svmclassifier,testing_features_df,) +PredictMD.predict(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict(knetmlpclassifier,testing_features_df,) From 1834929a78496c930e2b9596e520c1edd7febe45 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 00:16:37 -0400 Subject: [PATCH 07/62] Uncomment "import BSON" line --- src/io/saveload.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/saveload.jl b/src/io/saveload.jl index 924003b28..f7ea8f46d 100644 --- a/src/io/saveload.jl +++ b/src/io/saveload.jl @@ -1,4 +1,4 @@ -# import BSON +import BSON import FileIO 
import JLD2 import ProgressMeter From 03585b1f0f0249410b5f40e5be76541dae32926a Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 00:18:22 -0400 Subject: [PATCH 08/62] Delete test.bson --- test.bson | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.bson diff --git a/test.bson b/test.bson deleted file mode 100644 index e69de29bb..000000000 From e62fcd2ed9ce0db3d2ee56c1fb7e59ec4ec8472a Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 00:23:27 -0400 Subject: [PATCH 09/62] Remove SVM regression examples (epsilon-SVR and nu-SVR) from tests --- test/cpu/functional/bostonhousing/setup_bostonhousing.jl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/cpu/functional/bostonhousing/setup_bostonhousing.jl b/test/cpu/functional/bostonhousing/setup_bostonhousing.jl index 6cf1cad22..2194d7243 100644 --- a/test/cpu/functional/bostonhousing/setup_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/setup_bostonhousing.jl @@ -1,13 +1,9 @@ ENV["linearreg_filename"] = string(tempname(), "_linearreg.jld2") ENV["randomforestreg_filename"] = string(tempname(), "_randomforestreg.jld2") -ENV["epsilonsvr_svmreg_filename"] = string(tempname(), "_epsilonsvr_svmreg.jld2") -ENV["nusvr_svmreg_filename"] = string(tempname(), "_nusvr_svmreg.jld2") ENV["knetmlpreg_filename"] = string(tempname(), "_knetmlpreg.jld2") Base.Test.@test(!isfile(ENV["linearreg_filename"])) Base.Test.@test(!isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(!isfile(ENV["epsilonsvr_svmreg_filename"])) -Base.Test.@test(!isfile(ENV["nusvr_svmreg_filename"])) Base.Test.@test(!isfile(ENV["knetmlpreg_filename"])) ENV["LOADTRAINEDMODELSFROMFILE"] = "false" @@ -16,8 +12,6 @@ include("run_bostonhousing.jl") Base.Test.@test(isfile(ENV["linearreg_filename"])) Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["epsilonsvr_svmreg_filename"])) -Base.Test.@test(isfile(ENV["nusvr_svmreg_filename"])) Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) ENV["LOADTRAINEDMODELSFROMFILE"] = "true" @@ -26,6 +20,4 @@ include("run_bostonhousing.jl") Base.Test.@test(isfile(ENV["linearreg_filename"])) Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["epsilonsvr_svmreg_filename"])) -Base.Test.@test(isfile(ENV["nusvr_svmreg_filename"])) Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) From 9b0b414730ab42543336c65a338622c6ef821beb Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 00:26:46 -0400 Subject: [PATCH 10/62] Remove SVM regression examples (epsilon SVR and nu SVR) from tests --- .../bostonhousing/run_bostonhousing.jl | 122 ------------------ 1 file changed, 122 deletions(-) diff --git a/test/cpu/functional/bostonhousing/run_bostonhousing.jl b/test/cpu/functional/bostonhousing/run_bostonhousing.jl index 8f212ff5d..cb5c077cf 100644 --- a/test/cpu/functional/bostonhousing/run_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/run_bostonhousing.jl @@ -1,7 +1,5 @@ linearreg_filename = ENV["linearreg_filename"] randomforestreg_filename = ENV["randomforestreg_filename"] -epsilonsvr_svmreg_filename = ENV["epsilonsvr_svmreg_filename"] -nusvr_svmreg_filename = ENV["nusvr_svmreg_filename"] knetmlpreg_filename = ENV["knetmlpreg_filename"] ############################################################################## @@ -219,126 +217,6 @@ PredictMD.singlelabelregressionmetrics( labelname, ) -############################################################################## -## Support vector machine 
(epsilon support vector regression) ################ -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - epsilonsvr_svmreg = PredictMD.load_model(epsilonsvr_svmreg_filename) -else - # Set up epsilon-SVR model - epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.EpsilonSVR, - name = "SVM (epsilon-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,training_features_df,training_labels_df,) -end - -# Plot true values versus predicted values for epsilon-SVR on training set -epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(epsilonsvr_svmreg_plot_training) - -# Plot true values versus predicted values for epsilon-SVR on testing set -epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(epsilonsvr_svmreg_plot_testing) - -# Evaluate performance of epsilon-SVR on training set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of epsilon-SVR on testing set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Support vector machine (nu support vector regression) ################ -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - nusvr_svmreg = PredictMD.load_model(nusvr_svmreg_filename) -else - # Set up nu-SVR model - nusvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVR, - name = "SVM (nu-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,training_features_df,training_labels_df,) -end - -# Plot true values versus predicted values for nu-SVR on training set -nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(nusvr_svmreg_plot_training) - -# Plot true values versus predicted values for nu-SVR on testing set -nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(nusvr_svmreg_plot_testing) - -# Evaluate performance of nu-SVR on training set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of nu-SVR on testing set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - ############################################################################## ## 
Multilayer perceptron (i.e. fully connected feedforward neural network) ### ############################################################################## From 6ff82cf3a55bf62c99e60c53084948800bf5a97f Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 02:19:09 -0400 Subject: [PATCH 11/62] Remove all references to epsilon-SVR and nu-SVR --- test/cpu/functional/bostonhousing/run_bostonhousing.jl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/cpu/functional/bostonhousing/run_bostonhousing.jl b/test/cpu/functional/bostonhousing/run_bostonhousing.jl index cb5c077cf..ee4e9a92d 100644 --- a/test/cpu/functional/bostonhousing/run_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/run_bostonhousing.jl @@ -407,8 +407,6 @@ PredictMD.singlelabelregressionmetrics( all_models = PredictMD.Fittable[ linearreg, randomforestreg, - epsilonsvr_svmreg, - nusvr_svmreg, knetmlpreg, ] @@ -437,8 +435,6 @@ showall(PredictMD.singlelabelregressionmetrics( if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" PredictMD.save_model(linearreg_filename, linearreg) PredictMD.save_model(randomforestreg_filename, randomforestreg) - PredictMD.save_model(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) - PredictMD.save_model(nusvr_svmreg_filename, nusvr_svmreg) PredictMD.save_model(knetmlpreg_filename, knetmlpreg) end @@ -454,13 +450,9 @@ end # Get real-valued predictions from each model for training set PredictMD.predict(linearreg,training_features_df,) PredictMD.predict(randomforestreg,training_features_df,) -PredictMD.predict(epsilonsvr_svmreg,training_features_df,) -PredictMD.predict(nusvr_svmreg,training_features_df,) PredictMD.predict(knetmlpreg,training_features_df,) # Get real-valued predictions from each model for testing set PredictMD.predict(linearreg,testing_features_df,) PredictMD.predict(randomforestreg,testing_features_df,) -PredictMD.predict(epsilonsvr_svmreg,testing_features_df,) -PredictMD.predict(nusvr_svmreg,testing_features_df,) PredictMD.predict(knetmlpreg,testing_features_df,) From 9107ab2500b7d48b5883f4e91430a30a14381da8 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 02:26:11 -0400 Subject: [PATCH 12/62] Edit installation section --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bfd4f9e2d..3efae6536 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ PredictMD is a [Julia](https://julialang.org/) package that provides a uniform i 1. Prerequisites - 2. Installation (recommended method) + 2. Installation 3. Examples @@ -104,7 +104,7 @@ If you receive an error (e.g. "command not found"), download and install pdf2svg * macOS: [http://brewinstall.org/Install-pdf2svg-on-Mac-with-Brew/](http://brewinstall.org/Install-pdf2svg-on-Mac-with-Brew/) * GNU/Linux: [https://github.com/dawbarton/pdf2svg](https://github.com/dawbarton/pdf2svg) -## 2. Installation (recommended method) +## 2. Installation **Step 1:** Make sure that you have followed all of the instructions in [Section 1 (Prerequisites)](#1-prerequisites). 
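A note on the model persistence pattern exercised by the test scripts earlier
in this series: filenames are generated with tempname(), models are written
with PredictMD.save_model() when SAVETRAINEDMODELSTOFILE is "true", and read
back with PredictMD.load_model() when LOADTRAINEDMODELSFROMFILE is "true".
The same round trip works outside the test harness; a minimal sketch, where
the filename is illustrative and `linearreg` stands for any fitted PredictMD
model:

model_filename = string(tempname(), "_linearreg.jld2")
PredictMD.save_model(model_filename, linearreg)
linearreg_reloaded = PredictMD.load_model(model_filename)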
From 66d7894a325a2b338d1b9cad652c4a17154acf3b Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 03:33:19 -0400 Subject: [PATCH 13/62] Rename "binaryclassclassifier" to "binaryclassifier" --- examples/breast_cancer_biopsy.jl | 64 +++++++++---------- src/PredictMD.jl | 2 +- .../singlelabelbinaryclassificationmetrics.jl | 16 ++--- src/plotting/plotprcurve.jl | 2 +- src/plotting/plotroccurve.jl | 2 +- ...tsinglelabelbinaryclassifierhistograms.jl} | 2 +- .../run_breastcancerbiopsy.jl | 56 ++++++++-------- 7 files changed, 72 insertions(+), 72 deletions(-) rename src/plotting/{plotsinglelabelbinaryclassclassifierhistograms.jl => plotsinglelabelbinaryclassifierhistograms.jl} (97%) diff --git a/examples/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy.jl index f03784592..972324a1e 100644 --- a/examples/breast_cancer_biopsy.jl +++ b/examples/breast_cancer_biopsy.jl @@ -177,7 +177,7 @@ end PredictMD.get_underlying(logisticclassifier) # Plot classifier histogram for logistic classifier on smoted training set -logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -187,7 +187,7 @@ logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram PredictMD.open(logistic_hist_training) # Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, testing_features_df, testing_labels_df, @@ -197,7 +197,7 @@ logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( PredictMD.open(logistic_hist_testing) # Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, @@ -207,7 +207,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, @@ -247,7 +247,7 @@ end PredictMD.get_underlying(probitclassifier) # Plot classifier histogram for probit classifier on smoted training set -probitclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +probitclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( probitclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -257,7 +257,7 @@ probitclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierh PredictMD.open(probitclassifier_hist_training) # Plot classifier histogram for probit classifier on testing set -probitclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +probitclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( probitclassifier, testing_features_df, testing_labels_df, @@ -267,7 +267,7 @@ probitclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhi PredictMD.open(probitclassifier_hist_testing) # Evaluate performance of probit classifier on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( probitclassifier, 
smotedtraining_features_df, smotedtraininglabels_df, @@ -277,7 +277,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of probit classifier on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( probitclassifier, testing_features_df, testing_labels_df, @@ -315,7 +315,7 @@ else end # Plot classifier histogram for random forest classifier on smoted training set -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -325,7 +325,7 @@ rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhisto PredictMD.open(rfclassifier_hist_training) # Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, testing_features_df, testing_labels_df, @@ -335,7 +335,7 @@ rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistog PredictMD.open(rfclassifier_hist_testing) # Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -345,7 +345,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, testing_features_df, testing_labels_df, @@ -383,7 +383,7 @@ else end # Plot classifier histogram for C-SVC on smoted training set -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( csvc_svmclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -393,7 +393,7 @@ csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifie PredictMD.open(csvc_svmclassifier_hist_training) # Plot classifier histogram for C-SVC on testing set -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( csvc_svmclassifier, testing_features_df, testing_labels_df, @@ -403,7 +403,7 @@ csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifier PredictMD.open(csvc_svmclassifier_hist_testing) # Evaluate performance of C-SVC on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -413,7 +413,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of C-SVC on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, testing_features_df, testing_labels_df, @@ -451,7 +451,7 @@ else end # Plot classifier histogram for nu-SVC on smoted training set -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( 
nusvc_svmclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -461,7 +461,7 @@ nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifi PredictMD.open(nusvc_svmclassifier_hist_training) # Plot classifier histogram for nu-SVC on testing set -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( nusvc_svmclassifier, testing_features_df, testing_labels_df, @@ -471,7 +471,7 @@ nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifie PredictMD.open(nusvc_svmclassifier_hist_testing) # Evaluate performance of nu-SVC on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -481,7 +481,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of SVM on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, testing_features_df, testing_labels_df, @@ -662,7 +662,7 @@ knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcur PredictMD.open(knet_learningcurve_lossvsiteration_skip100iterations) # Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( knetmlpclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -672,7 +672,7 @@ knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifier PredictMD.open(knetmlpclassifier_hist_training) # Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( knetmlpclassifier, testing_features_df, testing_labels_df, @@ -682,7 +682,7 @@ knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierh PredictMD.open(knetmlpclassifier_hist_testing) # Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, smotedtraining_features_df, smotedtraininglabels_df, @@ -692,7 +692,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, testing_features_df, testing_labels_df, @@ -708,7 +708,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ############################################################################## # Compare performance of all models on smoted training set -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -723,7 +723,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; sensitivity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -738,7 +738,7 @@ 
showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; specificity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -753,7 +753,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; maximize = :f1score, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -770,7 +770,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( )) # Compare performance of all models on testing set -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -785,7 +785,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; sensitivity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -800,7 +800,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; specificity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, @@ -815,7 +815,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; maximize = :f1score, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( [ logisticclassifier, probitclassifier, diff --git a/src/PredictMD.jl b/src/PredictMD.jl index 420a9ab95..626fc37f7 100644 --- a/src/PredictMD.jl +++ b/src/PredictMD.jl @@ -70,7 +70,7 @@ include("plotting/plotlearningcurve.jl") include("plotting/plotprcurve.jl") include("plotting/plotroccurve.jl") include("plotting/plotsinglelabelregressiontruevspredicted.jl") -include("plotting/plotsinglelabelbinaryclassclassifierhistograms.jl") +include("plotting/plotsinglelabelbinaryclassifierhistograms.jl") include("plotting/probability_calibration_plots.jl") # postprocessing/ diff --git a/src/metrics/singlelabelbinaryclassificationmetrics.jl b/src/metrics/singlelabelbinaryclassificationmetrics.jl index bb3fefe5e..fce8e3c0f 100644 --- a/src/metrics/singlelabelbinaryclassificationmetrics.jl +++ b/src/metrics/singlelabelbinaryclassificationmetrics.jl @@ -26,7 +26,7 @@ function singlelabelbinaryyscore( return result end -function _singlelabelbinaryclassclassificationmetrics_tunableparam( +function _singlelabelbinaryclassificationmetrics_tunableparam( kwargsassoc::Associative, ) tunableparams = [ @@ -102,7 +102,7 @@ function _singlelabelbinaryclassclassificationmetrics_tunableparam( return selectedtunableparam, selectedparamtomax, metricprintnames end -function _singlelabelbinaryclassclassificationmetrics( +function _singlelabelbinaryclassificationmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -114,7 +114,7 @@ function _singlelabelbinaryclassclassificationmetrics( kwargsdict = Dict(kwargs) kwargsdict = fix_dict_type(kwargsdict) selectedtunableparam, selectedparamtomax, metricprintnames = - _singlelabelbinaryclassclassificationmetrics_tunableparam(kwargsdict) + _singlelabelbinaryclassificationmetrics_tunableparam(kwargsdict) # predictedprobabilitiesalllabels = predict_proba(estimator, features_df) yscore = 
Cfloat.( @@ -189,7 +189,7 @@ function _singlelabelbinaryclassclassificationmetrics( return results end -function singlelabelbinaryclassclassificationmetrics( +function singlelabelbinaryclassificationmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -198,7 +198,7 @@ function singlelabelbinaryclassclassificationmetrics( kwargs... ) vectorofestimators = Fittable[estimator] - result = singlelabelbinaryclassclassificationmetrics( + result = singlelabelbinaryclassificationmetrics( vectorofestimators, features_df, labels_df, @@ -209,7 +209,7 @@ function singlelabelbinaryclassclassificationmetrics( return result end -function singlelabelbinaryclassclassificationmetrics( +function singlelabelbinaryclassificationmetrics( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -220,9 +220,9 @@ function singlelabelbinaryclassclassificationmetrics( kwargsdict = Dict(kwargs) kwargsdict = fix_dict_type(kwargsdict) selectedtunableparam, selectedparamtomax, metricprintnames = - _singlelabelbinaryclassclassificationmetrics_tunableparam(kwargsdict) + _singlelabelbinaryclassificationmetrics_tunableparam(kwargsdict) metricsforeachestimator = [ - _singlelabelbinaryclassclassificationmetrics( + _singlelabelbinaryclassificationmetrics( est, features_df, labels_df, diff --git a/src/plotting/plotprcurve.jl b/src/plotting/plotprcurve.jl index 4e862b94d..3927c101c 100644 --- a/src/plotting/plotprcurve.jl +++ b/src/plotting/plotprcurve.jl @@ -33,7 +33,7 @@ function plotprcurve( alllinearplotobjects = [] for i = 1:length(vectorofestimators) estimator_i = vectorofestimators[i] - metrics_i = _singlelabelbinaryclassclassificationmetrics( + metrics_i = _singlelabelbinaryclassificationmetrics( estimator_i, features_df, labels_df, diff --git a/src/plotting/plotroccurve.jl b/src/plotting/plotroccurve.jl index 0622da28c..0b5c0b6c0 100644 --- a/src/plotting/plotroccurve.jl +++ b/src/plotting/plotroccurve.jl @@ -33,7 +33,7 @@ function plotroccurve( alllinearplotobjects = [] for i = 1:length(vectorofestimators) estimator_i = vectorofestimators[i] - metrics_i = _singlelabelbinaryclassclassificationmetrics( + metrics_i = _singlelabelbinaryclassificationmetrics( estimator_i, features_df, labels_df, diff --git a/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl b/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl similarity index 97% rename from src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl rename to src/plotting/plotsinglelabelbinaryclassifierhistograms.jl index f8798efd7..348f10edc 100644 --- a/src/plotting/plotsinglelabelbinaryclassclassifierhistograms.jl +++ b/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl @@ -2,7 +2,7 @@ import LaTeXStrings import PGFPlots import PGFPlotsX -function plotsinglelabelbinaryclassclassifierhistogram( +function plotsinglelabelbinaryclassifierhistogram( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, diff --git a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl index 5cd202640..829537a75 100644 --- a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl +++ b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl @@ -160,7 +160,7 @@ end PredictMD.get_underlying(logisticclassifier) # Plot classifier histogram for logistic classifier on smoted training set 
-logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -170,7 +170,7 @@ logistic_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram PredictMD.open_plot(logistic_hist_training) # Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, testing_features_df, testing_labels_df, @@ -180,7 +180,7 @@ logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( PredictMD.open_plot(logistic_hist_testing) # Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, @@ -190,7 +190,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, @@ -282,7 +282,7 @@ else end # Plot classifier histogram for random forest classifier on smoted training set -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -292,7 +292,7 @@ rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhisto PredictMD.open_plot(rfclassifier_hist_training) # Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, testing_features_df, testing_labels_df, @@ -302,7 +302,7 @@ rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistog PredictMD.open_plot(rfclassifier_hist_testing) # Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -312,7 +312,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, testing_features_df, testing_labels_df, @@ -362,7 +362,7 @@ else end # Plot classifier histogram for C-SVC on smoted training set -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( csvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -372,7 +372,7 @@ csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifie PredictMD.open_plot(csvc_svmclassifier_hist_training) # Plot classifier histogram for C-SVC on testing set -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +csvc_svmclassifier_hist_testing = 
PredictMD.plotsinglelabelbinaryclassifierhistogram( csvc_svmclassifier, testing_features_df, testing_labels_df, @@ -382,7 +382,7 @@ csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifier PredictMD.open_plot(csvc_svmclassifier_hist_testing) # Evaluate performance of C-SVC on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -392,7 +392,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of C-SVC on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, testing_features_df, testing_labels_df, @@ -432,7 +432,7 @@ else end # Plot classifier histogram for nu-SVC on smoted training set -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( nusvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -442,7 +442,7 @@ nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifi PredictMD.open_plot(nusvc_svmclassifier_hist_training) # Plot classifier histogram for nu-SVC on testing set -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( nusvc_svmclassifier, testing_features_df, testing_labels_df, @@ -452,7 +452,7 @@ nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifie PredictMD.open_plot(nusvc_svmclassifier_hist_testing) # Evaluate performance of nu-SVC on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -462,7 +462,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of SVM on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, testing_features_df, testing_labels_df, @@ -640,7 +640,7 @@ knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcur PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) # Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( knetmlpclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -650,7 +650,7 @@ knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassclassifier PredictMD.open_plot(knetmlpclassifier_hist_training) # Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierhistogram( +knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( knetmlpclassifier, testing_features_df, testing_labels_df, @@ -660,7 +660,7 @@ knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassclassifierh PredictMD.open_plot(knetmlpclassifier_hist_testing) # Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassclassificationmetrics( 
+PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, smoted_training_features_df, smoted_training_labels_df, @@ -670,7 +670,7 @@ PredictMD.singlelabelbinaryclassclassificationmetrics( ) # Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassclassificationmetrics( +PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, testing_features_df, testing_labels_df, @@ -694,7 +694,7 @@ all_models = PredictMD.Fittable[ ] # Compare performance of all models on smoted training set -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, @@ -702,7 +702,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; sensitivity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, @@ -710,7 +710,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; specificity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, @@ -718,7 +718,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; maximize = :f1score, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, @@ -728,7 +728,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( )) # Compare performance of all models on testing set -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, @@ -736,7 +736,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; sensitivity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, @@ -744,7 +744,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; specificity = 0.95, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, @@ -752,7 +752,7 @@ showall(PredictMD.singlelabelbinaryclassclassificationmetrics( positiveclass; maximize = :f1score, )) -showall(PredictMD.singlelabelbinaryclassclassificationmetrics( +showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, From 6d56b7c68a75be7084b42a9cadc4ff56e666e94c Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Sun, 20 May 2018 22:02:42 -0400 Subject: [PATCH 14/62] Move "examples" folder to "examples_old" (since these examples are really outdated) --- {examples => examples_old}/boston_housing.jl | 0 .../boston_housing/boston_housing_linear_regression.ipynb | 0 .../boston_housing/boston_housing_linear_regression.jl | 0 .../boston_housing/boston_housing_metric_comparison.ipynb | 0 .../boston_housing/boston_housing_metric_comparison.jl | 0 .../boston_housing/boston_housing_mlp.ipynb | 0 {examples => examples_old}/boston_housing/boston_housing_mlp.jl | 0 .../boston_housing/boston_housing_random_forest.ipynb | 0 
.../boston_housing/boston_housing_random_forest.jl | 0 .../boston_housing/boston_housing_svm.ipynb | 0 {examples => examples_old}/boston_housing/boston_housing_svm.jl | 0 {examples => examples_old}/breast_cancer_biopsy.jl | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename {examples => examples_old}/boston_housing.jl (100%) rename {examples => examples_old}/boston_housing/boston_housing_linear_regression.ipynb (100%) rename {examples => examples_old}/boston_housing/boston_housing_linear_regression.jl (100%) rename {examples => examples_old}/boston_housing/boston_housing_metric_comparison.ipynb (100%) rename {examples => examples_old}/boston_housing/boston_housing_metric_comparison.jl (100%) rename {examples => examples_old}/boston_housing/boston_housing_mlp.ipynb (100%) rename {examples => examples_old}/boston_housing/boston_housing_mlp.jl (100%) rename {examples => examples_old}/boston_housing/boston_housing_random_forest.ipynb (100%) rename {examples => examples_old}/boston_housing/boston_housing_random_forest.jl (100%) rename {examples => examples_old}/boston_housing/boston_housing_svm.ipynb (100%) rename {examples => examples_old}/boston_housing/boston_housing_svm.jl (100%) rename {examples => examples_old}/breast_cancer_biopsy.jl (100%) diff --git a/examples/boston_housing.jl b/examples_old/boston_housing.jl similarity index 100% rename from examples/boston_housing.jl rename to examples_old/boston_housing.jl diff --git a/examples/boston_housing/boston_housing_linear_regression.ipynb b/examples_old/boston_housing/boston_housing_linear_regression.ipynb similarity index 100% rename from examples/boston_housing/boston_housing_linear_regression.ipynb rename to examples_old/boston_housing/boston_housing_linear_regression.ipynb diff --git a/examples/boston_housing/boston_housing_linear_regression.jl b/examples_old/boston_housing/boston_housing_linear_regression.jl similarity index 100% rename from examples/boston_housing/boston_housing_linear_regression.jl rename to examples_old/boston_housing/boston_housing_linear_regression.jl diff --git a/examples/boston_housing/boston_housing_metric_comparison.ipynb b/examples_old/boston_housing/boston_housing_metric_comparison.ipynb similarity index 100% rename from examples/boston_housing/boston_housing_metric_comparison.ipynb rename to examples_old/boston_housing/boston_housing_metric_comparison.ipynb diff --git a/examples/boston_housing/boston_housing_metric_comparison.jl b/examples_old/boston_housing/boston_housing_metric_comparison.jl similarity index 100% rename from examples/boston_housing/boston_housing_metric_comparison.jl rename to examples_old/boston_housing/boston_housing_metric_comparison.jl diff --git a/examples/boston_housing/boston_housing_mlp.ipynb b/examples_old/boston_housing/boston_housing_mlp.ipynb similarity index 100% rename from examples/boston_housing/boston_housing_mlp.ipynb rename to examples_old/boston_housing/boston_housing_mlp.ipynb diff --git a/examples/boston_housing/boston_housing_mlp.jl b/examples_old/boston_housing/boston_housing_mlp.jl similarity index 100% rename from examples/boston_housing/boston_housing_mlp.jl rename to examples_old/boston_housing/boston_housing_mlp.jl diff --git a/examples/boston_housing/boston_housing_random_forest.ipynb b/examples_old/boston_housing/boston_housing_random_forest.ipynb similarity index 100% rename from examples/boston_housing/boston_housing_random_forest.ipynb rename to examples_old/boston_housing/boston_housing_random_forest.ipynb diff --git 
a/examples/boston_housing/boston_housing_random_forest.jl b/examples_old/boston_housing/boston_housing_random_forest.jl similarity index 100% rename from examples/boston_housing/boston_housing_random_forest.jl rename to examples_old/boston_housing/boston_housing_random_forest.jl diff --git a/examples/boston_housing/boston_housing_svm.ipynb b/examples_old/boston_housing/boston_housing_svm.ipynb similarity index 100% rename from examples/boston_housing/boston_housing_svm.ipynb rename to examples_old/boston_housing/boston_housing_svm.ipynb diff --git a/examples/boston_housing/boston_housing_svm.jl b/examples_old/boston_housing/boston_housing_svm.jl similarity index 100% rename from examples/boston_housing/boston_housing_svm.jl rename to examples_old/boston_housing/boston_housing_svm.jl diff --git a/examples/breast_cancer_biopsy.jl b/examples_old/breast_cancer_biopsy.jl similarity index 100% rename from examples/breast_cancer_biopsy.jl rename to examples_old/breast_cancer_biopsy.jl From c6dbbcf1fdc51aef70820fa09a5154bc9bfb6440 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 00:04:16 -0400 Subject: [PATCH 15/62] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index df0f2ea89..4fe840da5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,8 @@ deps/custom_preamble.tex deps/deps.jl deps/pdf2svg.svg deps/showed_warning -docs/build +docs/build/ +docs/site/ input/ output/ scratch.jl From dd8446aedd9e6388a52ea98dac10238e443c8912 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 02:17:04 -0400 Subject: [PATCH 16/62] Update gitignore --- .gitignore | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 4fe840da5..baa48b7d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,17 +1,29 @@ -.DS_Store *.aux + *.ipynb_checkpoints/ + *.jl.*.cov + *.jl.cov + *.jl.mem + *.log + +.DS_Store + data/ + deps/custom_preamble.tex + deps/deps.jl + deps/pdf2svg.svg + deps/showed_warning -docs/build/ -docs/site/ + input/ + output/ + scratch.jl From 11d71eb4d601fe13d9b458b446fd968a13b313f5 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 02:23:22 -0400 Subject: [PATCH 17/62] Reorganize .gitignore --- .gitignore | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index baa48b7d7..2c762178b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,29 +1,23 @@ -*.aux +# Directories to ignore: *.ipynb_checkpoints/ +data/ +docs/build/ +docs/site/ +input/ +output/ -*.jl.*.cov +# Files to ignore: +.DS_Store +*.aux *.jl.cov - +*.jl.*.cov *.jl.mem - *.log - -.DS_Store - -data/ - deps/custom_preamble.tex - deps/deps.jl - deps/pdf2svg.svg - deps/showed_warning - -input/ - -output/ - scratch.jl + From da379fa882aec389649dfbc4990e0234c0b55054 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 02:24:25 -0400 Subject: [PATCH 18/62] Set up Documenter.jl --- docs/make.jl | 6 +++ docs/src/index.md | 45 +++++++++++++++++++ .../ordinary_least_squares_regression.jl | 7 ++- src/metrics/brier_score.jl | 6 ++- ...lotsinglelabelregressiontruevspredicted.jl | 5 +-- src/plotting/probability_calibration_plots.jl | 10 ++--- src/utils/trapz.jl | 5 +++ 7 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 docs/make.jl create mode 100644 docs/src/index.md diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 000000000..fb229ab62 --- /dev/null +++ b/docs/make.jl 
@@ -0,0 +1,6 @@ +import Documenter +import PredictMD + +Documenter.makedocs( + modules = [PredictMD], + ) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 000000000..e453d8625 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,45 @@ +# PredictMD.jl Documentation + +```@contents +``` + +## Public (exported) functions + +```@autodocs +Modules = [PredictMD] +Public = true +Private = false +Order = [:function] +``` + +## Public (exported) types + +```@autodocs +Modules = [PredictMD] +Public = true +Private = false +Order = [:type] +``` + +## Private functions + +```@autodocs +Modules = [PredictMD] +Public = false +Private = true +Order = [:function] +``` + +## Private types + +```@autodocs +Modules = [PredictMD] +Public = false +Private = true +Order = [:type] +``` + +## Index + +```@index +``` diff --git a/src/linearmodel/ordinary_least_squares_regression.jl b/src/linearmodel/ordinary_least_squares_regression.jl index d89799939..894714855 100644 --- a/src/linearmodel/ordinary_least_squares_regression.jl +++ b/src/linearmodel/ordinary_least_squares_regression.jl @@ -3,11 +3,10 @@ import GLM import StatsModels function ordinary_least_squares_regression( - ; - X::AbstractVector{T} where T <: Real = Real[], - Y::AbstractVector{T} where T <: Real = Real[], + X::AbstractVector{T}, + Y::AbstractVector{T}; intercept::Bool = true, - ) + ) where T <: Real if length(X) != length(Y) error("length(X) != length(Y)") end diff --git a/src/metrics/brier_score.jl b/src/metrics/brier_score.jl index 95a52cd8f..143c2fec0 100644 --- a/src/metrics/brier_score.jl +++ b/src/metrics/brier_score.jl @@ -1,7 +1,9 @@ """ -binary_brier_score(ytrue, yscore) + binary_brier_score(ytrue, yscore) + Computes the binary formulation of the Brier score, defined as: -BS = \frac{1}{N}\sum\limits _{t=1}^{N}(f_t-o_t)^2 \,\! + +\$\\frac{1}{N}\\sum\\limits _{t=1}^{N}(f_t-o_t)^2 \\,\\!\$ Lower values are better. Best value is 0. 
""" diff --git a/src/plotting/plotsinglelabelregressiontruevspredicted.jl b/src/plotting/plotsinglelabelregressiontruevspredicted.jl index 55e86e1f7..634f8d045 100644 --- a/src/plotting/plotsinglelabelregressiontruevspredicted.jl +++ b/src/plotting/plotsinglelabelregressiontruevspredicted.jl @@ -48,9 +48,8 @@ function plotsinglelabelregressiontrueversuspredicted( ) estimated_intercept, estimated_x_coefficient = ordinary_least_squares_regression( - ; - X = Float64.(ypred), - Y = Float64.(ytrue), + Float64.(ypred), # X + Float64.(ytrue); # Y intercept = true, ) bestfitline_linearplotobject = PGFPlots.Plots.Linear( diff --git a/src/plotting/probability_calibration_plots.jl b/src/plotting/probability_calibration_plots.jl index 87f6dd8a3..afd1fae9c 100644 --- a/src/plotting/probability_calibration_plots.jl +++ b/src/plotting/probability_calibration_plots.jl @@ -120,9 +120,8 @@ function plot_probability_calibration_curve( ) estimated_intercept, estimated_x_coefficient = ordinary_least_squares_regression( - ; - X = scores, - Y = fractions, + scores, # X + fractions; # Y intercept = true, ) bestfitline_linearplotobject = PGFPlots.Plots.Linear( @@ -213,9 +212,8 @@ function probability_calibration_metrics( r2_score_value = r2_score(scores, fractions) estimated_intercept, estimated_x_coefficient = ordinary_least_squares_regression( - ; - X = Float64.(scores), - Y = Float64.(fractions), + Float64.(scores), # X + Float64.(fractions); # Y intercept = true, ) result[Symbol(vectorofestimators[i].name)] = [ diff --git a/src/utils/trapz.jl b/src/utils/trapz.jl index cdae96bd7..fda4f4b0b 100644 --- a/src/utils/trapz.jl +++ b/src/utils/trapz.jl @@ -1,5 +1,10 @@ import NumericalIntegration +""" + trapz(x,y) + +Compute the area under the curve of (x,y) points using the trapezoidal method. 
+""" function trapz( x::AbstractVector, y::AbstractVector, From cd19273eaee70bcaf642f0fc2c8b3146304e63b6 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 02:24:34 -0400 Subject: [PATCH 19/62] Set up mkdocs --- docs/mkdocs.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 docs/mkdocs.yml diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 000000000..2c29b5f76 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,25 @@ +site_name: PredictMD.jl +repo_url: https://github.com/bcbi/PredictMD.jl +site_description: Uniform interface for machine learning in Julia +site_author: Center for Biomedical Informatics, Brown University + +theme: readthedocs + +extra_css: + - assets/Documenter.css + +extra_javascript: + - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS_HTML + - assets/mathjaxhelper.js + +markdown_extensions: + - extra + - tables + - fenced_code + - mdx_math + +docs_dir: 'build' + +pages: + - Home: index.md + From 0f865368e0d384939ef0f6284427b4543d59a0e7 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 02:43:55 -0400 Subject: [PATCH 20/62] Progress commit --- docs/src/index.md | 26 ++++------------------- src/metrics/brier_score.jl | 4 ++-- src/metrics/coefficientofdetermination.jl | 5 +++-- src/utils/trapz.jl | 2 +- 4 files changed, 10 insertions(+), 27 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index e453d8625..ce0cc136f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -3,42 +3,24 @@ ```@contents ``` -## Public (exported) functions +## Types ```@autodocs Modules = [PredictMD] Public = true -Private = false -Order = [:function] -``` - -## Public (exported) types - -```@autodocs -Modules = [PredictMD] -Public = true -Private = false +Private = true Order = [:type] ``` -## Private functions +## Functions ```@autodocs Modules = [PredictMD] -Public = false +Public = true Private = true Order = [:function] ``` -## Private types - -```@autodocs -Modules = [PredictMD] -Public = false -Private = true -Order = [:type] -``` - ## Index ```@index diff --git a/src/metrics/brier_score.jl b/src/metrics/brier_score.jl index 143c2fec0..361538a08 100644 --- a/src/metrics/brier_score.jl +++ b/src/metrics/brier_score.jl @@ -1,9 +1,9 @@ -""" +doc""" binary_brier_score(ytrue, yscore) Computes the binary formulation of the Brier score, defined as: -\$\\frac{1}{N}\\sum\\limits _{t=1}^{N}(f_t-o_t)^2 \\,\\!\$ +$\frac{1}{N}\sum\limits _{t=1}^{N}(f_t-o_t)^2 \,\!\$ Lower values are better. Best value is 0. """ diff --git a/src/metrics/coefficientofdetermination.jl b/src/metrics/coefficientofdetermination.jl index a4f0d708c..0fb5873c4 100644 --- a/src/metrics/coefficientofdetermination.jl +++ b/src/metrics/coefficientofdetermination.jl @@ -1,7 +1,8 @@ import StatsBase -""" -r2_score(ytrue, ypred) +doc""" + r2_score(ytrue, ypred) + Computes coefficient of determination. Higher values are better. Best value is 1. """ diff --git a/src/utils/trapz.jl b/src/utils/trapz.jl index fda4f4b0b..76b195789 100644 --- a/src/utils/trapz.jl +++ b/src/utils/trapz.jl @@ -1,6 +1,6 @@ import NumericalIntegration -""" +doc""" trapz(x,y) Compute the area under the curve of (x,y) points using the trapezoidal method. 
From 7e7ff76e8f1af2caf7c72948152939146c49ca2d Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 04:47:25 -0400 Subject: [PATCH 21/62] Update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2c762178b..a127ba002 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.ipynb_checkpoints/ data/ docs/build/ +docs/generated/ docs/site/ input/ output/ From 6ea4cb65361f7cda40cbe0e6058de48c2e50b1d5 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 04:47:36 -0400 Subject: [PATCH 22/62] Improve docs --- docs/deploy_docs.jl | 2 + docs/make.jl | 6 --- docs/make_docs.jl | 11 +++++ docs/mkdocs.yml | 2 + docs/src/index.md | 25 ----------- docs/src/library/internals.md | 45 +++++++++++++++++++ src/PredictMD.jl | 1 + src/base/interface.jl | 0 src/base/types.jl | 17 +++++++ src/base/version.jl | 5 +++ src/classimbalance/smote.jl | 6 +++ src/gpu/cudnn.jl | 2 + src/io/saveload.jl | 4 ++ src/linearmodel/glm.jl | 26 +++++++++++ .../ordinary_least_squares_regression.jl | 2 + src/metrics/auprc.jl | 2 + src/metrics/aurocc.jl | 18 ++++---- src/metrics/averageprecisionscore.jl | 4 ++ src/metrics/brier_score.jl | 6 ++- src/metrics/coefficientofdetermination.jl | 2 +- src/metrics/cohenkappa.jl | 12 +++++ src/metrics/getbinarythresholds.jl | 2 + src/metrics/mean_square_error.jl | 6 +++ src/metrics/prcurve.jl | 4 ++ src/metrics/risk_score_cutoff_values.jl | 4 ++ src/metrics/roccurve.jl | 4 ++ src/metrics/rocnumsmetrics.jl | 28 ++++++++++++ .../singlelabelbinaryclassificationmetrics.jl | 12 +++++ src/metrics/singlelabelregressionmetrics.jl | 10 +++++ src/modelselection/split_data.jl | 4 ++ src/neuralnetwork/knet.jl | 22 +++++++++ src/pipeline/simplelinearpipeline.jl | 16 +++++++ src/plotting/plotlearningcurve.jl | 14 ++++-- src/plotting/plotprcurve.jl | 10 +++-- src/plotting/plotroccurve.jl | 10 +++-- ...otsinglelabelbinaryclassifierhistograms.jl | 2 + ...lotsinglelabelregressiontruevspredicted.jl | 2 + src/plotting/probability_calibration_plots.jl | 12 +++++ src/postprocessing/packagemultilabelpred.jl | 14 ++++++ src/postprocessing/packagesinglelabelpred.jl | 14 ++++++ src/postprocessing/packagesinglelabelproba.jl | 14 ++++++ src/postprocessing/predictoutput.jl | 16 +++++++ src/postprocessing/predictprobaoutput.jl | 14 ++++++ src/preprocessing/dataframecontrasts.jl | 6 +++ src/preprocessing/dataframetodecisiontree.jl | 18 ++++++++ src/preprocessing/dataframetoglm.jl | 19 +++++++- src/preprocessing/dataframetoknet.jl | 36 +++++++++++++++ src/preprocessing/dataframetosvm.jl | 20 +++++++++ src/svm/libsvm.jl | 22 +++++++++ src/tree/decisiontree.jl | 24 ++++++++++ src/utils/fix_dict_type.jl | 2 + src/utils/fix_vector_type.jl | 4 ++ src/utils/formulas.jl | 10 +++++ src/utils/labelstringintmaps.jl | 4 ++ src/utils/nothings.jl | 6 +++ src/utils/openbrowserwindow.jl | 2 + src/utils/openplotsduringtestsenv.jl | 2 + src/utils/predictionsassoctodataframe.jl | 2 + src/utils/probabilitiestopredictions.jl | 4 ++ src/utils/runtestsenv.jl | 2 + src/utils/shufflerows.jl | 4 ++ src/utils/simplemovingaverage.jl | 2 + src/utils/tikzpictures.jl | 14 ++++++ src/utils/trapz.jl | 2 +- src/utils/traviscienv.jl | 2 + 65 files changed, 585 insertions(+), 54 deletions(-) create mode 100644 docs/deploy_docs.jl delete mode 100644 docs/make.jl create mode 100644 docs/make_docs.jl create mode 100644 docs/src/library/internals.md create mode 100644 src/base/interface.jl diff --git a/docs/deploy_docs.jl b/docs/deploy_docs.jl new file mode 100644 index 
000000000..c95e30b6c --- /dev/null +++ b/docs/deploy_docs.jl @@ -0,0 +1,2 @@ +import Documenter +import PredictMD diff --git a/docs/make.jl b/docs/make.jl deleted file mode 100644 index fb229ab62..000000000 --- a/docs/make.jl +++ /dev/null @@ -1,6 +0,0 @@ -import Documenter -import PredictMD - -Documenter.makedocs( - modules = [PredictMD], - ) diff --git a/docs/make_docs.jl b/docs/make_docs.jl new file mode 100644 index 000000000..39c6fd454 --- /dev/null +++ b/docs/make_docs.jl @@ -0,0 +1,11 @@ +import Documenter +import PredictMD + +Documenter.makedocs( + modules = [PredictMD], + sitename = "PredictMD.jl", + pages = Any[ + "index.md", + "library/internals.md", + ], + ) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 2c29b5f76..8e1b87574 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -22,4 +22,6 @@ docs_dir: 'build' pages: - Home: index.md + - Library: + - 'Internals': 'library/internals.md' diff --git a/docs/src/index.md b/docs/src/index.md index ce0cc136f..8269835ff 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,27 +1,2 @@ # PredictMD.jl Documentation -```@contents -``` - -## Types - -```@autodocs -Modules = [PredictMD] -Public = true -Private = true -Order = [:type] -``` - -## Functions - -```@autodocs -Modules = [PredictMD] -Public = true -Private = true -Order = [:function] -``` - -## Index - -```@index -``` diff --git a/docs/src/library/internals.md b/docs/src/library/internals.md new file mode 100644 index 000000000..bff731afd --- /dev/null +++ b/docs/src/library/internals.md @@ -0,0 +1,45 @@ +# Documentation of internals + +```@contents +Pages = ["internals.md"] +``` + +## Modules + +```@autodocs +Modules = [PredictMD] +Order = [:module] +``` + +## Constants + +```@autodocs +Modules = [PredictMD] +Order = [:constant] +``` + +## Types + +```@autodocs +Modules = [PredictMD] +Order = [:type] +``` + +## Functions + +```@autodocs +Modules = [PredictMD] +Order = [:function] +``` + +## Macros + +```@autodocs +Modules = [PredictMD] +Order = [:macro] +``` + +## Index + +```@index +``` diff --git a/src/PredictMD.jl b/src/PredictMD.jl index 626fc37f7..ce93e9c75 100644 --- a/src/PredictMD.jl +++ b/src/PredictMD.jl @@ -3,6 +3,7 @@ __precompile__(true) module PredictMD # base/ +include("base/interface.jl") include("base/types.jl") include("base/version.jl") diff --git a/src/base/interface.jl b/src/base/interface.jl new file mode 100644 index 000000000..e69de29bb diff --git a/src/base/types.jl b/src/base/types.jl index ecdf0f872..f8ed7d4de 100644 --- a/src/base/types.jl +++ b/src/base/types.jl @@ -1,9 +1,26 @@ +""" + AbstractEstimator +""" abstract type AbstractEstimator end +""" + AbstractFeatureContrasts +""" abstract type AbstractFeatureContrasts end +""" + AbstractPipeline +""" abstract type AbstractPipeline end +""" + AbstractTransformer +""" abstract type AbstractTransformer end const Fittable = Union{AbstractEstimator,AbstractPipeline,AbstractTransformer} + +""" + Fittable +""" +Fittable diff --git a/src/base/version.jl b/src/base/version.jl index 576dd2f68..2425250d3 100644 --- a/src/base/version.jl +++ b/src/base/version.jl @@ -4,3 +4,8 @@ catch e warn("WARN While creating PredictMD.VERSION, ignoring error $(e)") VersionNumber(0) end + +""" + VERSION +""" +VERSION diff --git a/src/classimbalance/smote.jl b/src/classimbalance/smote.jl index 41a6020cd..715a01cb1 100644 --- a/src/classimbalance/smote.jl +++ b/src/classimbalance/smote.jl @@ -1,6 +1,8 @@ import ClassImbalance import DataFrames +""" +""" function calculate_smote_pct_under( ; pct_over::Real = 0, @@ 
-16,6 +18,8 @@ function calculate_smote_pct_under( return result end +""" +""" function smote( features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -42,6 +46,8 @@ function smote( return result end +""" +""" function smote( rng::AbstractRNG, features_df::DataFrames.AbstractDataFrame, diff --git a/src/gpu/cudnn.jl b/src/gpu/cudnn.jl index 8846b285b..d30a02768 100644 --- a/src/gpu/cudnn.jl +++ b/src/gpu/cudnn.jl @@ -1,5 +1,7 @@ import Requires +""" +""" has_cudnn() = false Requires.@require CUDNN begin diff --git a/src/io/saveload.jl b/src/io/saveload.jl index f7ea8f46d..60c0a79b0 100644 --- a/src/io/saveload.jl +++ b/src/io/saveload.jl @@ -3,6 +3,8 @@ import FileIO import JLD2 import ProgressMeter +""" +""" function save_model(filename::AbstractString,fittable_object_to_save::Fittable) # make sure that the filename ends in ".jld2" if lowercase(strip(splitext(filename)[2])) != ".jld2" @@ -26,6 +28,8 @@ function save_model(filename::AbstractString,fittable_object_to_save::Fittable) return nothing end +""" +""" function load_model(filename::AbstractString) # make sure that the filename ends in ".jld2" if lowercase(strip(splitext(filename)[2])) != ".jld2" diff --git a/src/linearmodel/glm.jl b/src/linearmodel/glm.jl index b759b51f3..fb23a8c95 100644 --- a/src/linearmodel/glm.jl +++ b/src/linearmodel/glm.jl @@ -2,6 +2,8 @@ import DataFrames import GLM import StatsModels +""" +""" mutable struct GLMModel <: AbstractEstimator name::T1 where T1 <: AbstractString isclassificationmodel::T2 where T2 <: Bool @@ -36,6 +38,8 @@ mutable struct GLMModel <: AbstractEstimator end end +""" +""" function get_history( x::GLMModel; saving::Bool = false, @@ -44,6 +48,8 @@ function get_history( return nothing end +""" +""" function set_feature_contrasts!( x::GLMModel, feature_contrasts::AbstractFeatureContrasts, @@ -51,6 +57,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::GLMModel; saving::Bool = false, @@ -60,6 +68,8 @@ function get_underlying( return result end +""" +""" function fit!( estimator::GLMModel, features_df::DataFrames.AbstractDataFrame, @@ -89,6 +99,8 @@ function fit!( return estimator end +""" +""" function predict( estimator::GLMModel, features_df::DataFrames.AbstractDataFrame, @@ -123,6 +135,8 @@ function predict( end end +""" +""" function predict_proba( estimator::GLMModel, features_df::DataFrames.AbstractDataFrame, @@ -148,6 +162,8 @@ function predict_proba( end end +""" +""" function _singlelabelbinaryclassdataframelogisticclassifier_GLM( featurenames::AbstractVector, singlelabelname::Symbol, @@ -199,6 +215,8 @@ function _singlelabelbinaryclassdataframelogisticclassifier_GLM( return finalpipeline end +""" +""" function singlelabelbinaryclassdataframelogisticclassifier( featurenames::AbstractVector, singlelabelname::Symbol, @@ -223,6 +241,8 @@ function singlelabelbinaryclassdataframelogisticclassifier( end end +""" +""" function _singlelabelbinaryclassdataframeprobitclassifier_GLM( featurenames::AbstractVector, singlelabelname::Symbol, @@ -274,6 +294,8 @@ function _singlelabelbinaryclassdataframeprobitclassifier_GLM( return finalpipeline end +""" +""" function singlelabelbinaryclassdataframeprobitclassifier( featurenames::AbstractVector, singlelabelname::Symbol, @@ -298,6 +320,8 @@ function singlelabelbinaryclassdataframeprobitclassifier( end end +""" +""" function _singlelabeldataframelinearregression_GLM( featurenames::AbstractVector, singlelabelname::Symbol; @@ -327,6 +351,8 @@ function 
_singlelabeldataframelinearregression_GLM( return finalpipeline end +""" +""" function singlelabeldataframelinearregression( featurenames::AbstractVector, singlelabelname::Symbol; diff --git a/src/linearmodel/ordinary_least_squares_regression.jl b/src/linearmodel/ordinary_least_squares_regression.jl index 894714855..7681ffb3f 100644 --- a/src/linearmodel/ordinary_least_squares_regression.jl +++ b/src/linearmodel/ordinary_least_squares_regression.jl @@ -2,6 +2,8 @@ import DataFrames import GLM import StatsModels +""" +""" function ordinary_least_squares_regression( X::AbstractVector{T}, Y::AbstractVector{T}; diff --git a/src/metrics/auprc.jl b/src/metrics/auprc.jl index 72483ce5c..8e0b0cbb8 100644 --- a/src/metrics/auprc.jl +++ b/src/metrics/auprc.jl @@ -2,6 +2,8 @@ import MLBase import NumericalIntegration import StatsBase +""" +""" function auprc( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, diff --git a/src/metrics/aurocc.jl b/src/metrics/aurocc.jl index c64c7ae58..ae0012b54 100644 --- a/src/metrics/aurocc.jl +++ b/src/metrics/aurocc.jl @@ -2,7 +2,7 @@ import MLBase import ROCAnalysis import StatsBase -function _aurocc( +function _aurocc_trapz( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, ) @@ -18,8 +18,8 @@ function _aurocc( # x = allfpr y = alltpr - aurocc_result = trapz(x, y) - return aurocc_result + aurocc_trapz_result = trapz(x, y) + return aurocc_trapz_result end function _aurocc_verify( @@ -32,15 +32,17 @@ function _aurocc_verify( nontargetscores = yscore[ytrue .== nontargetlevel] r = ROCAnalysis.roc(targetscores, nontargetscores) complement_of_aurocc = ROCAnalysis.auc(r) - aurocc_result = 1 - complement_of_aurocc - return aurocc_result + aurocc_verify_result = 1 - complement_of_aurocc + return aurocc_verify_result end +""" +""" function aurocc( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, ) - aurocc_value = _aurocc( + aurocc_trapz_value = _aurocc_trapz( ytrue, yscore, ) @@ -48,8 +50,8 @@ function aurocc( ytrue, yscore, ) - if !( isapprox(aurocc_value, aurocc_verify_value; atol=0.00000001) ) + if !( isapprox(aurocc_trapz_value, aurocc_verify_value; atol=0.00000001) ) error("Was not able to accurately compute the AUROCC.") end - return aurocc_value + return aurocc_trapz_value end diff --git a/src/metrics/averageprecisionscore.jl b/src/metrics/averageprecisionscore.jl index 41458762a..ea3e00303 100644 --- a/src/metrics/averageprecisionscore.jl +++ b/src/metrics/averageprecisionscore.jl @@ -1,6 +1,8 @@ import MLBase import StatsBase +""" +""" function avg_precision( allprecisions::AbstractVector{<:Real}, allrecalls::AbstractVector{<:Real}, @@ -25,6 +27,8 @@ function avg_precision( return result end +""" +""" function averageprecisionscore( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, diff --git a/src/metrics/brier_score.jl b/src/metrics/brier_score.jl index 361538a08..f827b4358 100644 --- a/src/metrics/brier_score.jl +++ b/src/metrics/brier_score.jl @@ -1,9 +1,11 @@ -doc""" +""" binary_brier_score(ytrue, yscore) Computes the binary formulation of the Brier score, defined as: -$\frac{1}{N}\sum\limits _{t=1}^{N}(f_t-o_t)^2 \,\!\$ +```math +\\frac{1}{N}\\sum\\limits _{t=1}^{N}(f_t-o_t)^2 \\,\\! +``` Lower values are better. Best value is 0. 
""" diff --git a/src/metrics/coefficientofdetermination.jl b/src/metrics/coefficientofdetermination.jl index 0fb5873c4..16c32c168 100644 --- a/src/metrics/coefficientofdetermination.jl +++ b/src/metrics/coefficientofdetermination.jl @@ -1,6 +1,6 @@ import StatsBase -doc""" +""" r2_score(ytrue, ypred) Computes coefficient of determination. Higher values are better. Best value diff --git a/src/metrics/cohenkappa.jl b/src/metrics/cohenkappa.jl index fde251829..01d59c8ab 100644 --- a/src/metrics/cohenkappa.jl +++ b/src/metrics/cohenkappa.jl @@ -1,7 +1,11 @@ import MLBase +""" +""" is_square(m::AbstractMatrix) = size(m, 1) == size(m, 2) +""" +""" function cohen_kappa(contingency_table::AbstractMatrix) if !is_square(contingency_table) error("contingency_table must be a square matrix") @@ -25,6 +29,8 @@ function cohen_kappa(contingency_table::AbstractMatrix) return kappa end +""" +""" function compute_contingency_table(y1::AbstractVector, y2::AbstractVector) classes = sort(unique(vcat(y1, y2)); rev = false,) numclasses = length(classes) @@ -34,18 +40,24 @@ function compute_contingency_table(y1::AbstractVector, y2::AbstractVector) return contingency_table end +""" +""" function cohen_kappa(y1::AbstractVector, y2::AbstractVector) contingency_table = compute_contingency_table(y1, y2) result = cohen_kappa(contingency_table) return result end +""" +""" function compute_contingency_table(rocnums::MLBase.ROCNums) # we will arbitrarily set rows = predicted, columns = true/gold contingency_table = [rocnums.tp rocnums.fp; rocnums.fn rocnums.tp] return contingency_table end +""" +""" function cohen_kappa(rocnums::MLBase.ROCNums) contingency_table = compute_contingency_table(rocnums) result = cohen_kappa(contingency_table) diff --git a/src/metrics/getbinarythresholds.jl b/src/metrics/getbinarythresholds.jl index a1a8699ed..4bae59608 100644 --- a/src/metrics/getbinarythresholds.jl +++ b/src/metrics/getbinarythresholds.jl @@ -1,5 +1,7 @@ import StatsBase +""" +""" function get_binary_thresholds( yscore::AbstractVector{<:Real}; additionalthreshold::Real = 0.5, diff --git a/src/metrics/mean_square_error.jl b/src/metrics/mean_square_error.jl index 78eb3fc44..194ad49db 100644 --- a/src/metrics/mean_square_error.jl +++ b/src/metrics/mean_square_error.jl @@ -1,3 +1,6 @@ +""" + mean_square_error(ytrue, ypred) +""" function mean_square_error( ytrue::AbstractVector{<:Real}, ypred::AbstractVector{<:Real}, @@ -6,6 +9,9 @@ function mean_square_error( return result end +""" + root_mean_square_error(ytrue, ypred) +""" root_mean_square_error(ytrue,ypred) = sqrt(mean_square_error(ytrue,ypred)) # convenience aliases for mean squared error: diff --git a/src/metrics/prcurve.jl b/src/metrics/prcurve.jl index 2fdebf7f3..a9de1a67a 100644 --- a/src/metrics/prcurve.jl +++ b/src/metrics/prcurve.jl @@ -1,6 +1,8 @@ import MLBase import StatsBase +""" +""" function prcurve( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, @@ -16,6 +18,8 @@ function prcurve( return result end +""" +""" function prcurve( allrocnums::AbstractVector{<:MLBase.ROCNums}, allthresholds::AbstractVector{<:Real}, diff --git a/src/metrics/risk_score_cutoff_values.jl b/src/metrics/risk_score_cutoff_values.jl index 2074f4799..8cdf55bf6 100644 --- a/src/metrics/risk_score_cutoff_values.jl +++ b/src/metrics/risk_score_cutoff_values.jl @@ -1,5 +1,7 @@ import DataFrames +""" +""" function risk_score_cutoff_values( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -34,6 +36,8 @@ function risk_score_cutoff_values( return cutoffs, 
risk_group_prevalences end +""" +""" function risk_score_cutoff_values( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:AbstractFloat}; diff --git a/src/metrics/roccurve.jl b/src/metrics/roccurve.jl index 2d6220b46..f1f631616 100644 --- a/src/metrics/roccurve.jl +++ b/src/metrics/roccurve.jl @@ -1,6 +1,8 @@ import MLBase import StatsBase +""" +""" function roccurve( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}, @@ -16,6 +18,8 @@ function roccurve( return result end +""" +""" function roccurve( allrocnums::AbstractVector{<:MLBase.ROCNums}, allthresholds::AbstractVector{<:Real}, diff --git a/src/metrics/rocnumsmetrics.jl b/src/metrics/rocnumsmetrics.jl index 31550a4ad..13bd592d5 100644 --- a/src/metrics/rocnumsmetrics.jl +++ b/src/metrics/rocnumsmetrics.jl @@ -1,6 +1,8 @@ import MLBase import StatsBase +""" +""" function getallrocnums( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:Real}; @@ -18,16 +20,28 @@ function getallrocnums( return allrocnums, allthresholds end +""" +""" accuracy(x::MLBase.ROCNums) = (x.tp + x.tn)/(x.p + x.n) +""" +""" true_positive_rate(x::MLBase.ROCNums) = (x.tp)/(x.p) +""" +""" true_negative_rate(x::MLBase.ROCNums) = (x.tn)/(x.n) +""" +""" false_positive_rate(x::MLBase.ROCNums) = (x.fp)/(x.n) +""" +""" false_negative_rate(x::MLBase.ROCNums) = (x.fn)/(x.p) +""" +""" function positive_predictive_value(x::MLBase.ROCNums) if (x.tp == 0) && (x.tp + x.fp == 0) result = 1 @@ -39,6 +53,8 @@ function positive_predictive_value(x::MLBase.ROCNums) return result end +""" +""" function negative_predictive_value(x::MLBase.ROCNums) if (x.tn == 0) && (x.tn + x.fn ==0) result = 1 @@ -50,14 +66,24 @@ function negative_predictive_value(x::MLBase.ROCNums) return result end +""" +""" sensitivity(x::MLBase.ROCNums) = true_positive_rate(x) +""" +""" specificity(x::MLBase.ROCNums) = true_negative_rate(x) +""" +""" precision(x::MLBase.ROCNums) = positive_predictive_value(x) +""" +""" recall(x::MLBase.ROCNums) = true_positive_rate(x) +""" +""" function fbetascore( x::MLBase.ROCNums, beta::Real, @@ -68,4 +94,6 @@ function fbetascore( return result end +""" +""" f1score(x::MLBase.ROCNums) = fbetascore(x, 1) diff --git a/src/metrics/singlelabelbinaryclassificationmetrics.jl b/src/metrics/singlelabelbinaryclassificationmetrics.jl index fce8e3c0f..2305d43ba 100644 --- a/src/metrics/singlelabelbinaryclassificationmetrics.jl +++ b/src/metrics/singlelabelbinaryclassificationmetrics.jl @@ -2,6 +2,8 @@ import DataFrames import MLBase import StatsBase +""" +""" function singlelabelbinaryytrue( labels::AbstractVector, positiveclass::AbstractString; @@ -14,6 +16,8 @@ function singlelabelbinaryytrue( return result end +""" +""" function singlelabelbinaryyscore( singlelabelprobabilities::Associative, positiveclass::AbstractString; @@ -26,6 +30,8 @@ function singlelabelbinaryyscore( return result end +""" +""" function _singlelabelbinaryclassificationmetrics_tunableparam( kwargsassoc::Associative, ) @@ -102,6 +108,8 @@ function _singlelabelbinaryclassificationmetrics_tunableparam( return selectedtunableparam, selectedparamtomax, metricprintnames end +""" +""" function _singlelabelbinaryclassificationmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -189,6 +197,8 @@ function _singlelabelbinaryclassificationmetrics( return results end +""" +""" function singlelabelbinaryclassificationmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -209,6 +219,8 @@ function singlelabelbinaryclassificationmetrics( return result end 
+""" +""" function singlelabelbinaryclassificationmetrics( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, diff --git a/src/metrics/singlelabelregressionmetrics.jl b/src/metrics/singlelabelregressionmetrics.jl index 2dc0ed6aa..c416f358b 100644 --- a/src/metrics/singlelabelregressionmetrics.jl +++ b/src/metrics/singlelabelregressionmetrics.jl @@ -2,6 +2,8 @@ import DataFrames import MLBase import StatsBase +""" +""" function singlelabelregressionytrue( labels::AbstractVector; floattype::Type = Cfloat, @@ -13,6 +15,8 @@ function singlelabelregressionytrue( return result end +""" +""" function singlelabelregressionypred( labels::AbstractVector; floattype::Type = Cfloat, @@ -24,6 +28,8 @@ function singlelabelregressionypred( return result end +""" +""" function _singlelabelregressionmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -54,6 +60,8 @@ function _singlelabelregressionmetrics( return results end +""" +""" function singlelabelregressionmetrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -70,6 +78,8 @@ function singlelabelregressionmetrics( return result end +""" +""" function singlelabelregressionmetrics( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, diff --git a/src/modelselection/split_data.jl b/src/modelselection/split_data.jl index 82b5ebe7d..1f4da4f4f 100644 --- a/src/modelselection/split_data.jl +++ b/src/modelselection/split_data.jl @@ -1,6 +1,8 @@ import DataFrames import StatsBase +""" +""" function split_data( features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -15,6 +17,8 @@ function split_data( return result end +""" +""" function split_data( rng::AbstractRNG, features_df::DataFrames.AbstractDataFrame, diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index 1b6d1b45c..5015125d2 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -2,6 +2,8 @@ import Knet import ProgressMeter import ValueHistories +""" +""" mutable struct KnetModel <: AbstractEstimator name::T1 where T1 <: AbstractString isclassificationmodel::T2 where T2 <: Bool @@ -84,6 +86,8 @@ mutable struct KnetModel <: AbstractEstimator end end +""" +""" function set_feature_contrasts!( x::KnetModel, feature_contrasts::AbstractFeatureContrasts, @@ -91,6 +95,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::KnetModel; saving::Bool = false, @@ -100,6 +106,8 @@ function get_underlying( return result end +""" +""" function get_history( x::KnetModel; saving::Bool = false, @@ -109,6 +117,8 @@ function get_history( return result end +""" +""" function fit!( estimator::KnetModel, training_features_array::AbstractArray, @@ -296,6 +306,8 @@ function fit!( return estimator end +""" +""" function predict( estimator::KnetModel, featuresarray::AbstractArray, @@ -322,6 +334,8 @@ function predict( end end +""" +""" function predict_proba( estimator::KnetModel, featuresarray::AbstractArray, @@ -346,6 +360,8 @@ function predict_proba( end end +""" +""" function _singlelabelmulticlassdataframeknetclassifier_Knet( featurenames::AbstractVector, singlelabelname::Symbol, @@ -423,6 +439,8 @@ function _singlelabelmulticlassdataframeknetclassifier_Knet( return finalpipeline end +""" +""" function singlelabelmulticlassdataframeknetclassifier( featurenames::AbstractVector, singlelabelname::Symbol, @@ -463,6 +481,8 @@ function singlelabelmulticlassdataframeknetclassifier( end end +""" +""" function 
_singlelabeldataframeknetregression_Knet( featurenames::AbstractVector, singlelabelname::Symbol; @@ -520,6 +540,8 @@ function _singlelabeldataframeknetregression_Knet( return finalpipeline end +""" +""" function singlelabeldataframeknetregression( featurenames::AbstractVector, singlelabelname::Symbol; diff --git a/src/pipeline/simplelinearpipeline.jl b/src/pipeline/simplelinearpipeline.jl index cd6081c59..df724c712 100644 --- a/src/pipeline/simplelinearpipeline.jl +++ b/src/pipeline/simplelinearpipeline.jl @@ -1,8 +1,12 @@ +""" +""" struct SimplePipeline <: AbstractPipeline name::T1 where T1 <: AbstractString objectsvector::T2 where T2 <: AbstractVector{Fittable} end +""" +""" function SimplePipeline( objectsvector::AbstractVector{Fittable}; name::AbstractString = "", @@ -14,6 +18,8 @@ function SimplePipeline( return result end +""" +""" function set_feature_contrasts!( x::SimplePipeline, feature_contrasts::AbstractFeatureContrasts, @@ -24,6 +30,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::SimplePipeline; saving::Bool = false, @@ -49,6 +57,8 @@ function get_underlying( return underlying end +""" +""" function get_history( x::SimplePipeline; saving::Bool = false, @@ -74,6 +84,8 @@ function get_history( return history end +""" +""" function fit!( simplelinearpipeline::SimplePipeline, varargs...; @@ -98,6 +110,8 @@ function fit!( return output end +""" +""" function predict( simplelinearpipeline::SimplePipeline, varargs...; @@ -122,6 +136,8 @@ function predict( return output end +""" +""" function predict_proba( simplelinearpipeline::SimplePipeline, varargs...; diff --git a/src/plotting/plotlearningcurve.jl b/src/plotting/plotlearningcurve.jl index 6c16c0bcc..1d5a1d675 100644 --- a/src/plotting/plotlearningcurve.jl +++ b/src/plotting/plotlearningcurve.jl @@ -4,7 +4,9 @@ import PGFPlotsX import StatsBase import ValueHistories -function plotlearningcurve( +""" +""" +function plotlearningcurves( inputobject::Fittable, curvetype::Symbol = :loss_vs_iteration; window::Integer = 0, @@ -32,7 +34,9 @@ function plotlearningcurve( return result end -function plotlearningcurve( +""" +""" +function plotlearningcurves( history::ValueHistories.MultivalueHistory, curvetype::Symbol = :loss_vs_iteration; window::Integer = 0, @@ -157,7 +161,9 @@ function plotlearningcurve( return result end -function plotlearningcurve( +""" +""" +function plotlearningcurves( xvalues::AbstractVector{<:Real}, training_yvalues::AbstractVector{<:Real}, xlabel::AbstractString, @@ -264,4 +270,4 @@ function plotlearningcurve( return tikzpicture end -const plotlearningcurves = plotlearningcurve +const plotlearningcurve = plotlearningcurves diff --git a/src/plotting/plotprcurve.jl b/src/plotting/plotprcurve.jl index 3927c101c..9a5cca617 100644 --- a/src/plotting/plotprcurve.jl +++ b/src/plotting/plotprcurve.jl @@ -2,7 +2,9 @@ import LaTeXStrings import PGFPlots import PGFPlotsX -function plotprcurve( +""" +""" +function plotprcurves( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -20,7 +22,9 @@ function plotprcurve( return result end -function plotprcurve( +""" +""" +function plotprcurves( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -65,4 +69,4 @@ function plotprcurve( return tikzpicture end -const plotprcurves = plotprcurve +const plotprcurve = plotprcurves diff --git a/src/plotting/plotroccurve.jl b/src/plotting/plotroccurve.jl index 
0b5c0b6c0..f5439499d 100644 --- a/src/plotting/plotroccurve.jl +++ b/src/plotting/plotroccurve.jl @@ -2,7 +2,9 @@ import LaTeXStrings import PGFPlots import PGFPlotsX -function plotroccurve( +""" +""" +function plotroccurves( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -20,7 +22,9 @@ function plotroccurve( return result end -function plotroccurve( +""" +""" +function plotroccurves( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, labels_df::DataFrames.AbstractDataFrame, @@ -65,4 +69,4 @@ function plotroccurve( return tikzpicture end -const plotroccurves = plotroccurve +const plotroccurve = plotroccurves diff --git a/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl b/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl index 348f10edc..f07699c68 100644 --- a/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl +++ b/src/plotting/plotsinglelabelbinaryclassifierhistograms.jl @@ -2,6 +2,8 @@ import LaTeXStrings import PGFPlots import PGFPlotsX +""" +""" function plotsinglelabelbinaryclassifierhistogram( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, diff --git a/src/plotting/plotsinglelabelregressiontruevspredicted.jl b/src/plotting/plotsinglelabelregressiontruevspredicted.jl index 634f8d045..2c1644552 100644 --- a/src/plotting/plotsinglelabelregressiontruevspredicted.jl +++ b/src/plotting/plotsinglelabelregressiontruevspredicted.jl @@ -2,6 +2,8 @@ import LaTeXStrings import PGFPlots import PGFPlotsX +""" +""" function plotsinglelabelregressiontrueversuspredicted( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, diff --git a/src/plotting/probability_calibration_plots.jl b/src/plotting/probability_calibration_plots.jl index afd1fae9c..07cde90b0 100644 --- a/src/plotting/probability_calibration_plots.jl +++ b/src/plotting/probability_calibration_plots.jl @@ -2,6 +2,8 @@ import LaTeXStrings import PGFPlots import PGFPlotsX +""" +""" function probability_calibration_scores_and_fractions( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -33,6 +35,8 @@ function probability_calibration_scores_and_fractions( return scores, fractions end +""" +""" function probability_calibration_scores_and_fractions( ytrue::AbstractVector{<:Integer}, yscore::AbstractVector{<:AbstractFloat}; @@ -69,6 +73,8 @@ function probability_calibration_scores_and_fractions( return scores, fractions end +""" +""" function plot_probability_calibration_curve( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -91,6 +97,8 @@ function plot_probability_calibration_curve( return result end +""" +""" function plot_probability_calibration_curve( scores::AbstractVector{<:AbstractFloat}, fractions::AbstractVector{<:AbstractFloat}, @@ -149,6 +157,8 @@ function plot_probability_calibration_curve( return tikzpicture end +""" +""" function probability_calibration_metrics( estimator::Fittable, features_df::DataFrames.AbstractDataFrame, @@ -169,6 +179,8 @@ function probability_calibration_metrics( return result end +""" +""" function probability_calibration_metrics( vectorofestimators::AbstractVector{Fittable}, features_df::DataFrames.AbstractDataFrame, diff --git a/src/postprocessing/packagemultilabelpred.jl b/src/postprocessing/packagemultilabelpred.jl index 9bd3dce68..7492edf0c 100644 --- a/src/postprocessing/packagemultilabelpred.jl +++ b/src/postprocessing/packagemultilabelpred.jl @@ -1,8 +1,12 @@ +""" +""" struct 
ImmutablePackageMultiLabelPredictionTransformer <: AbstractEstimator labelnames::T1 where T1 <: AbstractVector{Symbol} end +""" +""" function set_feature_contrasts!( x::ImmutablePackageMultiLabelPredictionTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -10,6 +14,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutablePackageMultiLabelPredictionTransformer; saving::Bool = false, @@ -18,6 +24,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutablePackageMultiLabelPredictionTransformer; saving::Bool = false, @@ -26,6 +34,8 @@ function get_history( return nothing end +""" +""" function fit!( transformer::ImmutablePackageMultiLabelPredictionTransformer, varargs...; @@ -38,6 +48,8 @@ function fit!( end end +""" +""" function predict( transformer::ImmutablePackageMultiLabelPredictionTransformer, singlelabelpredictions::AbstractMatrix, @@ -49,6 +61,8 @@ function predict( return result end +""" +""" function predict_proba( transformer::ImmutablePackageMultiLabelPredictionTransformer, varargs...; diff --git a/src/postprocessing/packagesinglelabelpred.jl b/src/postprocessing/packagesinglelabelpred.jl index ec211c7e1..8fccf64bb 100644 --- a/src/postprocessing/packagesinglelabelpred.jl +++ b/src/postprocessing/packagesinglelabelpred.jl @@ -1,8 +1,12 @@ +""" +""" struct ImmutablePackageSingleLabelPredictionTransformer <: AbstractEstimator singlelabelname::T1 where T1 <: Symbol end +""" +""" function set_feature_contrasts!( x::ImmutablePackageSingleLabelPredictionTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -10,6 +14,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutablePackageSingleLabelPredictionTransformer; saving::Bool = false, @@ -18,6 +24,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutablePackageSingleLabelPredictionTransformer; saving::Bool = false, @@ -26,6 +34,8 @@ function get_history( return nothing end +""" +""" function fit!( transformer::ImmutablePackageSingleLabelPredictionTransformer, varargs...; @@ -38,6 +48,8 @@ function fit!( end end +""" +""" function predict( transformer::ImmutablePackageSingleLabelPredictionTransformer, singlelabelpredictions::AbstractVector, @@ -48,6 +60,8 @@ function predict( return result end +""" +""" function predict_proba( transformer::ImmutablePackageSingleLabelPredictionTransformer, varargs...; diff --git a/src/postprocessing/packagesinglelabelproba.jl b/src/postprocessing/packagesinglelabelproba.jl index 2297821d9..807b0de73 100644 --- a/src/postprocessing/packagesinglelabelproba.jl +++ b/src/postprocessing/packagesinglelabelproba.jl @@ -1,8 +1,12 @@ +""" +""" struct ImmutablePackageSingleLabelPredictProbaTransformer <: AbstractEstimator singlelabelname::T1 where T1 <: Symbol end +""" +""" function set_feature_contrasts!( x::ImmutablePackageSingleLabelPredictProbaTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -10,6 +14,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutablePackageSingleLabelPredictProbaTransformer; saving::Bool = false, @@ -18,6 +24,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutablePackageSingleLabelPredictProbaTransformer; saving::Bool = false, @@ -26,6 +34,8 @@ function get_history( return nothing end +""" +""" function fit!( transformer::ImmutablePackageSingleLabelPredictProbaTransformer, varargs...; @@ -38,6 +48,8 @@ function 
fit!( end end +""" +""" function predict( transformer::ImmutablePackageSingleLabelPredictProbaTransformer, varargs...; @@ -50,6 +62,8 @@ function predict( end end +""" +""" function predict_proba( transformer::ImmutablePackageSingleLabelPredictProbaTransformer, singlelabelprobabilities::Associative; diff --git a/src/postprocessing/predictoutput.jl b/src/postprocessing/predictoutput.jl index 68032d68b..c11c8c515 100644 --- a/src/postprocessing/predictoutput.jl +++ b/src/postprocessing/predictoutput.jl @@ -1,11 +1,15 @@ import DataFrames +""" +""" struct ImmutablePredictionsSingleLabelInt2StringTransformer <: AbstractEstimator index::T1 where T1 <: Integer levels::T2 where T2 <: AbstractVector end +""" +""" function set_feature_contrasts!( x::ImmutablePredictionsSingleLabelInt2StringTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -13,6 +17,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutablePredictionsSingleLabelInt2StringTransformer; saving::Bool = false, @@ -21,6 +27,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutablePredictionsSingleLabelInt2StringTransformer; saving::Bool = false, @@ -29,6 +37,8 @@ function get_history( return nothing end +""" +""" function fit!( transformer::ImmutablePredictionsSingleLabelInt2StringTransformer, varargs...; @@ -41,6 +51,8 @@ function fit!( end end +""" +""" function predict( transformer::ImmutablePredictionsSingleLabelInt2StringTransformer, singlelabelpredictions::AbstractVector; @@ -58,6 +70,8 @@ function predict( return result end +""" +""" function predict( transformer::ImmutablePredictionsSingleLabelInt2StringTransformer, singlelabelpredictions::DataFrames.AbstractDataFrame; @@ -75,6 +89,8 @@ function predict( return result end +""" +""" function predict_proba( transformer::ImmutablePredictionsSingleLabelInt2StringTransformer, varargs...; diff --git a/src/postprocessing/predictprobaoutput.jl b/src/postprocessing/predictprobaoutput.jl index 1fbc2009e..89a018968 100644 --- a/src/postprocessing/predictprobaoutput.jl +++ b/src/postprocessing/predictprobaoutput.jl @@ -1,9 +1,13 @@ +""" +""" struct ImmutablePredictProbaSingleLabelInt2StringTransformer <: AbstractEstimator index::T1 where T1 <: Integer levels::T2 where T2 <: AbstractVector end +""" +""" function set_feature_contrasts!( x::ImmutablePredictProbaSingleLabelInt2StringTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -11,6 +15,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutablePredictProbaSingleLabelInt2StringTransformer; saving::Bool = false, @@ -19,6 +25,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutablePredictProbaSingleLabelInt2StringTransformer; saving::Bool = false, @@ -27,6 +35,8 @@ function get_history( return nothing end +""" +""" function fit!( transformer::ImmutablePredictProbaSingleLabelInt2StringTransformer, varargs...; @@ -39,6 +49,8 @@ function fit!( end end +""" +""" function predict( transformer::ImmutablePredictProbaSingleLabelInt2StringTransformer, varargs...; @@ -51,6 +63,8 @@ function predict( end end +""" +""" function predict_proba( transformer::ImmutablePredictProbaSingleLabelInt2StringTransformer, singlelabelprobabilities::Associative; diff --git a/src/preprocessing/dataframecontrasts.jl b/src/preprocessing/dataframecontrasts.jl index d62fb8049..f4b1810fb 100644 --- a/src/preprocessing/dataframecontrasts.jl +++ b/src/preprocessing/dataframecontrasts.jl @@ -1,5 
+1,7 @@ import DataFrames +""" +""" struct DataFrameFeatureContrasts <: AbstractFeatureContrasts columns::T1 where T1 <: AbstractVector{Symbol} num_df_columns::T2 where T2 <: Integer @@ -7,6 +9,8 @@ struct DataFrameFeatureContrasts <: AbstractFeatureContrasts num_array_columns::T4 where T4 <: Integer end +""" +""" function DataFrameFeatureContrasts( df::DataFrames.AbstractDataFrame, columns::AbstractVector{Symbol}, @@ -34,6 +38,8 @@ function DataFrameFeatureContrasts( return result end +""" +""" function generate_feature_contrasts( df::DataFrames.AbstractDataFrame, columns::AbstractVector{Symbol}, diff --git a/src/preprocessing/dataframetodecisiontree.jl b/src/preprocessing/dataframetodecisiontree.jl index b5d32403c..92093b1ac 100644 --- a/src/preprocessing/dataframetodecisiontree.jl +++ b/src/preprocessing/dataframetodecisiontree.jl @@ -1,6 +1,8 @@ import DataFrames import StatsModels +""" +""" mutable struct MutableDataFrame2DecisionTreeTransformer <: AbstractEstimator featurenames::T1 where T1 <: AbstractVector @@ -21,6 +23,8 @@ mutable struct MutableDataFrame2DecisionTreeTransformer <: end end +""" +""" function set_feature_contrasts!( x::MutableDataFrame2DecisionTreeTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -29,6 +33,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::MutableDataFrame2DecisionTreeTransformer; saving::Bool = false, @@ -38,6 +44,8 @@ function get_underlying( return result end +""" +""" function get_history( x::MutableDataFrame2DecisionTreeTransformer; saving::Bool = false, @@ -46,6 +54,8 @@ function get_history( return nothing end +""" +""" function transform( transformer::MutableDataFrame2DecisionTreeTransformer, features_df::DataFrames.AbstractDataFrame, @@ -69,6 +79,8 @@ function transform( return featuresarray, labelsarray end +""" +""" function transform( transformer::MutableDataFrame2DecisionTreeTransformer, features_df::DataFrames.AbstractDataFrame; @@ -89,6 +101,8 @@ function transform( return featuresarray end +""" +""" function fit!( transformer::MutableDataFrame2DecisionTreeTransformer, features_df::DataFrames.AbstractDataFrame, @@ -98,6 +112,8 @@ function fit!( return transform(transformer, features_df, labels_df) end +""" +""" function predict( transformer::MutableDataFrame2DecisionTreeTransformer, features_df::DataFrames.AbstractDataFrame; @@ -106,6 +122,8 @@ function predict( return transform(transformer, features_df) end +""" +""" function predict_proba( transformer::MutableDataFrame2DecisionTreeTransformer, features_df::DataFrames.AbstractDataFrame; diff --git a/src/preprocessing/dataframetoglm.jl b/src/preprocessing/dataframetoglm.jl index 7f9a86af0..263763795 100644 --- a/src/preprocessing/dataframetoglm.jl +++ b/src/preprocessing/dataframetoglm.jl @@ -1,11 +1,15 @@ import DataFrames +""" +""" struct ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer <: AbstractEstimator label::T1 where T1 <: Symbol positiveclass::T2 where T2 <: AbstractString end +""" +""" function set_feature_contrasts!( x::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -13,6 +17,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer; saving::Bool = false, @@ -21,6 +27,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer; saving::Bool = false, @@ -29,6 +37,8 @@ function 
get_history( return nothing end +""" +""" function transform( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, features_df::DataFrames.AbstractDataFrame, @@ -44,6 +54,8 @@ function transform( return features_df, transformedlabels_df end +""" +""" function transform( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, features_df::DataFrames.AbstractDataFrame; @@ -52,6 +64,8 @@ function transform( return features_df end +""" +""" function fit!( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, features_df::DataFrames.AbstractDataFrame, @@ -61,6 +75,8 @@ function fit!( return transform(transformer, features_df, labels_df) end +""" +""" function predict( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, features_df::DataFrames.AbstractDataFrame; @@ -69,7 +85,8 @@ function predict( return transform(transformer, features_df) end - +""" +""" function predict_proba( transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer, features_df::DataFrames.AbstractDataFrame; diff --git a/src/preprocessing/dataframetoknet.jl b/src/preprocessing/dataframetoknet.jl index 503a5112b..ffd7b90b7 100644 --- a/src/preprocessing/dataframetoknet.jl +++ b/src/preprocessing/dataframetoknet.jl @@ -1,6 +1,8 @@ import DataFrames import StatsModels +""" +""" mutable struct MutableDataFrame2ClassificationKnetTransformer <: AbstractEstimator featurenames::T1 where T1 <: AbstractVector @@ -30,6 +32,8 @@ mutable struct MutableDataFrame2ClassificationKnetTransformer <: end end +""" +""" mutable struct MutableDataFrame2RegressionKnetTransformer <: AbstractEstimator featurenames::T1 where T1 <: AbstractVector @@ -53,6 +57,8 @@ mutable struct MutableDataFrame2RegressionKnetTransformer <: end end +""" +""" function get_history( x::MutableDataFrame2ClassificationKnetTransformer; saving::Bool = false, @@ -61,6 +67,8 @@ function get_history( return nothing end +""" +""" function get_history( x::MutableDataFrame2RegressionKnetTransformer; saving::Bool = false, @@ -69,6 +77,8 @@ function get_history( return nothing end +""" +""" function get_underlying( x::MutableDataFrame2ClassificationKnetTransformer; saving::Bool = false, @@ -78,6 +88,8 @@ function get_underlying( return result end +""" +""" function get_underlying( x::MutableDataFrame2RegressionKnetTransformer; saving::Bool = false, @@ -87,6 +99,8 @@ function get_underlying( return result end +""" +""" function set_feature_contrasts!( x::MutableDataFrame2ClassificationKnetTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -95,6 +109,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function set_feature_contrasts!( x::MutableDataFrame2RegressionKnetTransformer, contrasts::AbstractFeatureContrasts, @@ -103,6 +119,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function fit!( transformer::MutableDataFrame2ClassificationKnetTransformer, training_features_df::DataFrames.AbstractDataFrame, @@ -122,6 +140,8 @@ function fit!( return result end +""" +""" function fit!( transformer::MutableDataFrame2RegressionKnetTransformer, training_features_df::DataFrames.AbstractDataFrame, @@ -141,6 +161,8 @@ function fit!( return result end +""" +""" function predict( transformer::MutableDataFrame2ClassificationKnetTransformer, features_df::DataFrames.AbstractDataFrame; @@ -149,6 +171,8 @@ function predict( return transform(transformer, features_df) end +""" +""" function predict_proba( transformer::MutableDataFrame2ClassificationKnetTransformer, 
features_df::DataFrames.AbstractDataFrame; @@ -157,6 +181,8 @@ function predict_proba( return transform(transformer, features_df) end +""" +""" function predict( transformer::MutableDataFrame2RegressionKnetTransformer, features_df::DataFrames.AbstractDataFrame; @@ -170,6 +196,8 @@ function predict( return result end +""" +""" function predict_proba( transformer::MutableDataFrame2RegressionKnetTransformer, features_df::DataFrames.AbstractDataFrame; @@ -183,6 +211,8 @@ function predict_proba( return result end +""" +""" function transform( transformer::MutableDataFrame2ClassificationKnetTransformer, training_features_df::DataFrames.AbstractDataFrame, @@ -267,6 +297,8 @@ function transform( end end +""" +""" function transform( transformer::MutableDataFrame2ClassificationKnetTransformer, features_df::DataFrames.AbstractDataFrame; @@ -290,6 +322,8 @@ function transform( return featuresarray end +""" +""" function transform( transformer::MutableDataFrame2RegressionKnetTransformer, training_features_df::DataFrames.AbstractDataFrame, @@ -352,6 +386,8 @@ function transform( end end +""" +""" function transform( transformer::MutableDataFrame2RegressionKnetTransformer, features_df::DataFrames.AbstractDataFrame, diff --git a/src/preprocessing/dataframetosvm.jl b/src/preprocessing/dataframetosvm.jl index 691f995d0..4dc617492 100644 --- a/src/preprocessing/dataframetosvm.jl +++ b/src/preprocessing/dataframetosvm.jl @@ -1,10 +1,14 @@ import DataFrames import StatsModels +""" +""" struct ImmutableFeatureArrayTransposerTransformer <: AbstractEstimator end +""" +""" function set_feature_contrasts!( x::ImmutableFeatureArrayTransposerTransformer, feature_contrasts::AbstractFeatureContrasts, @@ -12,6 +16,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::ImmutableFeatureArrayTransposerTransformer; saving::Bool = false, @@ -20,6 +26,8 @@ function get_underlying( return nothing end +""" +""" function get_history( x::ImmutableFeatureArrayTransposerTransformer; saving::Bool = false, @@ -28,6 +36,8 @@ function get_history( return nothing end +""" +""" function transform( transformer::ImmutableFeatureArrayTransposerTransformer, featuresarray::AbstractMatrix, @@ -38,6 +48,8 @@ function transform( return featuresarraytransposed, labelsarray end +""" +""" function transform( transformer::ImmutableFeatureArrayTransposerTransformer, featuresarray::AbstractMatrix; @@ -47,6 +59,8 @@ function transform( return featuresarraytransposed end +""" +""" function fit!( transformer::ImmutableFeatureArrayTransposerTransformer, featuresarray::AbstractMatrix, @@ -56,6 +70,8 @@ function fit!( return transform(transformer, featuresarray, labelsarray) end +""" +""" function predict( transformer::ImmutableFeatureArrayTransposerTransformer, featuresarray::AbstractMatrix; @@ -64,6 +80,8 @@ function predict( return transform(transformer, featuresarray) end +""" +""" function predict_proba( transformer::ImmutableFeatureArrayTransposerTransformer, featuresarray::AbstractMatrix; @@ -72,6 +90,8 @@ function predict_proba( return transform(transformer, featuresarray) end +""" +""" function DataFrame2LIBSVMTransformer( featurenames::AbstractVector, singlelabelname::Symbol; diff --git a/src/svm/libsvm.jl b/src/svm/libsvm.jl index 13814c9da..9a583c47d 100644 --- a/src/svm/libsvm.jl +++ b/src/svm/libsvm.jl @@ -1,5 +1,7 @@ import LIBSVM +""" +""" mutable struct LIBSVMModel <: AbstractEstimator name::T1 where T1 <: AbstractString isclassificationmodel::T2 where T2 <: Bool @@ -62,6 +64,8 @@ mutable struct 
LIBSVMModel <: AbstractEstimator end end +""" +""" function set_feature_contrasts!( x::LIBSVMModel, feature_contrasts::AbstractFeatureContrasts, @@ -69,6 +73,8 @@ function set_feature_contrasts!( return nothing end +""" +""" function get_underlying( x::LIBSVMModel; saving::Bool = false, @@ -78,6 +84,8 @@ function get_underlying( return result end +""" +""" function get_history( x::LIBSVMModel; saving::Bool = false, @@ -86,6 +94,8 @@ function get_history( return nothing end +""" +""" function fit!( estimator::LIBSVMModel, featuresarray::AbstractArray, @@ -122,6 +132,8 @@ function fit!( return estimator end +""" +""" function predict( estimator::LIBSVMModel, featuresarray::AbstractArray, @@ -153,6 +165,8 @@ function predict( end end +""" +""" function predict_proba( estimator::LIBSVMModel, featuresarray::AbstractArray, @@ -182,6 +196,8 @@ function predict_proba( end end +""" +""" function _singlelabelmulticlassdataframesvmclassifier_LIBSVM( featurenames::AbstractVector, singlelabelname::Symbol, @@ -248,6 +264,8 @@ function _singlelabelmulticlassdataframesvmclassifier_LIBSVM( return finalpipeline end +""" +""" function singlelabelmulticlassdataframesvmclassifier( featurenames::AbstractVector, singlelabelname::Symbol, @@ -296,6 +314,8 @@ function singlelabelmulticlassdataframesvmclassifier( end end +""" +""" function _singlelabeldataframesvmregression_LIBSVM( featurenames::AbstractVector, singlelabelname::Symbol; @@ -355,6 +375,8 @@ function _singlelabeldataframesvmregression_LIBSVM( return finalpipeline end +""" +""" function singlelabeldataframesvmregression( featurenames::AbstractVector, singlelabelname::Symbol, diff --git a/src/tree/decisiontree.jl b/src/tree/decisiontree.jl index 59e9e88bf..9598832d4 100644 --- a/src/tree/decisiontree.jl +++ b/src/tree/decisiontree.jl @@ -1,5 +1,7 @@ import DecisionTree +""" +""" mutable struct DecisionTreeModel <: AbstractEstimator name::T1 where T1 <: AbstractString @@ -42,6 +44,8 @@ mutable struct DecisionTreeModel <: end end +""" +""" function set_feature_contrasts!( x::DecisionTreeModel, feature_contrasts::AbstractFeatureContrasts, @@ -49,10 +53,14 @@ function set_feature_contrasts!( return nothing end +""" +""" function underlying(x::DecisionTreeModel) return nothing end +""" +""" function get_underlying( x::DecisionTreeModel; saving::Bool = false, @@ -62,6 +70,8 @@ function get_underlying( return result end +""" +""" function get_history( x::DecisionTreeModel; saving::Bool = false, @@ -70,6 +80,8 @@ function get_history( return nothing end +""" +""" function fit!( estimator::DecisionTreeModel, featuresarray::AbstractArray, @@ -97,6 +109,8 @@ function fit!( return estimator end +""" +""" function predict( estimator::DecisionTreeModel, featuresarray::AbstractArray, @@ -125,6 +139,8 @@ function predict( end end +""" +""" function predict_proba( estimator::DecisionTreeModel, featuresarray::AbstractArray, @@ -156,6 +172,8 @@ function predict_proba( end end +""" +""" function _singlelabelmulticlassdataframerandomforestclassifier_DecisionTree( featurenames::AbstractVector, singlelabelname::Symbol, @@ -200,6 +218,8 @@ function _singlelabelmulticlassdataframerandomforestclassifier_DecisionTree( return finalpipeline end +""" +""" function singlelabelmulticlassdataframerandomforestclassifier( featurenames::AbstractVector, singlelabelname::Symbol, @@ -226,6 +246,8 @@ function singlelabelmulticlassdataframerandomforestclassifier( end end +""" +""" function _singlelabeldataframerandomforestregression_DecisionTree( featurenames::AbstractVector, 
singlelabelname::Symbol; @@ -263,6 +285,8 @@ function _singlelabeldataframerandomforestregression_DecisionTree( return finalpipeline end +""" +""" function singlelabeldataframerandomforestregression( featurenames::AbstractVector, singlelabelname::Symbol; diff --git a/src/utils/fix_dict_type.jl b/src/utils/fix_dict_type.jl index 95c716ac9..8d66ba441 100644 --- a/src/utils/fix_dict_type.jl +++ b/src/utils/fix_dict_type.jl @@ -1,3 +1,5 @@ +""" +""" function fix_dict_type( x::Associative; default_key_type::Type = Symbol, diff --git a/src/utils/fix_vector_type.jl b/src/utils/fix_vector_type.jl index 879f409c5..78ce3842f 100644 --- a/src/utils/fix_vector_type.jl +++ b/src/utils/fix_vector_type.jl @@ -1,3 +1,5 @@ +""" +""" function fix_vector_type(x::AbstractVector) new_vector = [x...] return new_vector @@ -5,6 +7,8 @@ end fix_vector_type(x::Void) = x +""" +""" function fix_array_type(x::AbstractArray) new_array = reshape([x...], size(x)) return new_array diff --git a/src/utils/formulas.jl b/src/utils/formulas.jl index bf59e7474..2210d5d9a 100644 --- a/src/utils/formulas.jl +++ b/src/utils/formulas.jl @@ -1,8 +1,12 @@ import Combinatorics import StatsModels +""" +""" generate_formula(x::AbstractString) = generate_formula(parse(x)) +""" +""" function generate_formula(expression::Expr) original_expression = Meta.quot(copy(expression)) StatsModels.sort_terms!(StatsModels.parse!(expression)) @@ -23,6 +27,8 @@ function generate_formula(expression::Expr) return formula_7 end +""" +""" function generate_formula( dependent_variable::Symbol, independent_variables::AbstractVector{<:Symbol}; @@ -38,6 +44,8 @@ function generate_formula( return result end +""" +""" function generate_formula( dependent_variables::AbstractVector{<:Symbol}, independent_variables::AbstractVector{<:Symbol}; @@ -72,6 +80,8 @@ function generate_formula( return formula_object end +""" +""" function generate_interaction_terms( independent_variables::AbstractVector{<:Symbol}, interactions::Integer, diff --git a/src/utils/labelstringintmaps.jl b/src/utils/labelstringintmaps.jl index 487362dbd..8bbfed830 100644 --- a/src/utils/labelstringintmaps.jl +++ b/src/utils/labelstringintmaps.jl @@ -1,3 +1,5 @@ +""" +""" function _getlabelstring2intmap( levels::AbstractVector, index::Integer, @@ -13,6 +15,8 @@ function _getlabelstring2intmap( return result end +""" +""" function _getlabelint2stringmap( levels::AbstractVector, index::Integer, diff --git a/src/utils/nothings.jl b/src/utils/nothings.jl index e43abdb06..718e37be5 100644 --- a/src/utils/nothings.jl +++ b/src/utils/nothings.jl @@ -1,3 +1,5 @@ +""" +""" function delete_nothings!(x::AbstractVector) filter!(e->e≠nothing, x) return x @@ -5,3 +7,7 @@ end is_nothing(x::Void) = true is_nothing(x::Any) = false + +""" +""" +is_nothing diff --git a/src/utils/openbrowserwindow.jl b/src/utils/openbrowserwindow.jl index e9f312168..435182e17 100644 --- a/src/utils/openbrowserwindow.jl +++ b/src/utils/openbrowserwindow.jl @@ -1,5 +1,7 @@ # Originally based on https://github.com/JuliaPlots/Plots.jl/blob/master/src/backends/web.jl +""" +""" function open_browser_window(filename::AbstractString) if is_travis_ci(ENV) info(string("DEBUG Skipping opening file during Travis build: ",filename,)) diff --git a/src/utils/openplotsduringtestsenv.jl b/src/utils/openplotsduringtestsenv.jl index 77ed20e6b..959bf0461 100644 --- a/src/utils/openplotsduringtestsenv.jl +++ b/src/utils/openplotsduringtestsenv.jl @@ -1,3 +1,5 @@ +""" +""" function open_plots_during_tests(a::Associative) result = lowercase(get(a, 
"OPEN_PLOTS_DURING_TESTS", "")) == lowercase("true") diff --git a/src/utils/predictionsassoctodataframe.jl b/src/utils/predictionsassoctodataframe.jl index e2ee1074e..76955ed14 100644 --- a/src/utils/predictionsassoctodataframe.jl +++ b/src/utils/predictionsassoctodataframe.jl @@ -1,5 +1,7 @@ import DataFrames +""" +""" function predictionsassoctodataframe( probabilitiesassoc::Associative, labelnames::AbstractVector = [], diff --git a/src/utils/probabilitiestopredictions.jl b/src/utils/probabilitiestopredictions.jl index ec00f0d03..24cea7251 100644 --- a/src/utils/probabilitiestopredictions.jl +++ b/src/utils/probabilitiestopredictions.jl @@ -1,3 +1,5 @@ +""" +""" function multilabelprobabilitiestopredictions( probabilitiesassoc::Associative; floattype::Type = Cfloat, @@ -16,6 +18,8 @@ end const probabilitiestopredictions = multilabelprobabilitiestopredictions +""" +""" function singlelabelprobabilitiestopredictions( probabilitiesassoc::Associative; floattype::Type = Cfloat, diff --git a/src/utils/runtestsenv.jl b/src/utils/runtestsenv.jl index c713cf966..316c096f2 100644 --- a/src/utils/runtestsenv.jl +++ b/src/utils/runtestsenv.jl @@ -1,3 +1,5 @@ +""" +""" function is_runtests(a::Associative) result = lowercase(get(a, "PREDICTMD_RUNTESTS", "")) == lowercase("true") return result diff --git a/src/utils/shufflerows.jl b/src/utils/shufflerows.jl index 571cb1e6c..3b800e4d0 100644 --- a/src/utils/shufflerows.jl +++ b/src/utils/shufflerows.jl @@ -1,6 +1,8 @@ import DataFrames import StatsBase +""" +""" function shuffle_rows!( dataframe::DataFrames.AbstractDataFrame, ) @@ -8,6 +10,8 @@ function shuffle_rows!( return result end +""" +""" function shuffle_rows!( rng::AbstractRNG, dataframe::DataFrames.AbstractDataFrame, diff --git a/src/utils/simplemovingaverage.jl b/src/utils/simplemovingaverage.jl index 35cf1d77d..f5e5fe7ec 100644 --- a/src/utils/simplemovingaverage.jl +++ b/src/utils/simplemovingaverage.jl @@ -1,3 +1,5 @@ +""" +""" function simple_moving_average( x::AbstractVector, window::Integer, diff --git a/src/utils/tikzpictures.jl b/src/utils/tikzpictures.jl index f238fa4f6..a59066d12 100644 --- a/src/utils/tikzpictures.jl +++ b/src/utils/tikzpictures.jl @@ -1,5 +1,7 @@ import TikzPictures +""" +""" function save_plot(filename::AbstractString, tp::TikzPictures.TikzPicture) extension = lowercase(strip(splitext(filename)[2])) if extension == ".pdf" @@ -21,32 +23,44 @@ function save_plot(filename::AbstractString, tp::TikzPictures.TikzPicture) end end +""" +""" function save_plot_pdf(filename::AbstractString, tp::TikzPictures.TikzPicture) result = TikzPictures.save(TikzPictures.PDF(filename), tp) return result end +""" +""" function save_plot_tex(filename::AbstractString, tp::TikzPictures.TikzPicture) result = TikzPictures.save(TikzPictures.TEX(filename), tp) return result end +""" +""" function save_plot_tikz(filename::AbstractString, tp::TikzPictures.TikzPicture) result = TikzPictures.save(TikzPictures.TIKZ(filename), tp) return result end +""" +""" function save_plot_svg(filename::AbstractString, tp::TikzPictures.TikzPicture) result = TikzPictures.save(TikzPictures.SVG(filename), tp) return result end +""" +""" function open_plot(tp::TikzPictures.TikzPicture) tempsvgfilename = string(tempname(), ".svg") result = open_plot(tempsvgfilename, tp) return result end +""" +""" function open_plot(filename::AbstractString, tp::TikzPictures.TikzPicture) saveresult = save_plot_svg(filename, tp) openresult = open_browser_window(filename) diff --git a/src/utils/trapz.jl b/src/utils/trapz.jl index 
76b195789..fda4f4b0b 100644 --- a/src/utils/trapz.jl +++ b/src/utils/trapz.jl @@ -1,6 +1,6 @@ import NumericalIntegration -doc""" +""" trapz(x,y) Compute the area under the curve of (x,y) points using the trapezoidal method. diff --git a/src/utils/traviscienv.jl b/src/utils/traviscienv.jl index 3eb617701..53c46e5b7 100644 --- a/src/utils/traviscienv.jl +++ b/src/utils/traviscienv.jl @@ -1,3 +1,5 @@ +""" +""" function is_travis_ci(a::Associative) result = (lowercase(get(a, "CI", "")) == lowercase("true")) && (lowercase(get(a, "TRAVIS", "")) == lowercase("true")) && From 3660020524cf0e77f75efb876a862c71ad06b149 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 17:19:05 -0400 Subject: [PATCH 23/62] Progress commit --- docs/deploy_docs.jl | 14 + docs/make_docs.jl | 1 + docs/mkdocs.yml | 6 +- docs/src/examples.md | 0 docs/test_examples.jl | 11 + examples_old/boston_housing.jl | 603 ---- .../boston_housing_linear_regression.ipynb | 1561 ---------- .../boston_housing_linear_regression.jl | 149 - .../boston_housing_metric_comparison.ipynb | 382 --- .../boston_housing_metric_comparison.jl | 214 -- .../boston_housing/boston_housing_mlp.ipynb | 2619 ----------------- .../boston_housing/boston_housing_mlp.jl | 263 -- .../boston_housing_random_forest.ipynb | 1534 ---------- .../boston_housing_random_forest.jl | 143 - .../boston_housing/boston_housing_svm.ipynb | 2488 ---------------- .../boston_housing/boston_housing_svm.jl | 200 -- examples_old/breast_cancer_biopsy.jl | 928 ------ test/runtests.jl | 19 +- 18 files changed, 39 insertions(+), 11096 deletions(-) create mode 100644 docs/src/examples.md create mode 100644 docs/test_examples.jl delete mode 100644 examples_old/boston_housing.jl delete mode 100644 examples_old/boston_housing/boston_housing_linear_regression.ipynb delete mode 100644 examples_old/boston_housing/boston_housing_linear_regression.jl delete mode 100644 examples_old/boston_housing/boston_housing_metric_comparison.ipynb delete mode 100644 examples_old/boston_housing/boston_housing_metric_comparison.jl delete mode 100644 examples_old/boston_housing/boston_housing_mlp.ipynb delete mode 100644 examples_old/boston_housing/boston_housing_mlp.jl delete mode 100644 examples_old/boston_housing/boston_housing_random_forest.ipynb delete mode 100644 examples_old/boston_housing/boston_housing_random_forest.jl delete mode 100644 examples_old/boston_housing/boston_housing_svm.ipynb delete mode 100644 examples_old/boston_housing/boston_housing_svm.jl delete mode 100644 examples_old/breast_cancer_biopsy.jl diff --git a/docs/deploy_docs.jl b/docs/deploy_docs.jl index c95e30b6c..3c70459fe 100644 --- a/docs/deploy_docs.jl +++ b/docs/deploy_docs.jl @@ -1,2 +1,16 @@ import Documenter import PredictMD + +Documenter.deploydocs( + branch = "gh-pages", + deps = Documenter.Deps.pip( + "pygments", + "mkdocs", + "python-markdown-math", + ), + julia = "0.6", + latest = "develop", # latest = develop branch + osname = "linux", + repo = "github.com/bcbi/PredictMD.jl.git", + target = "site", + ) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 39c6fd454..25ee0804a 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -1,4 +1,5 @@ import Documenter +import Literate import PredictMD Documenter.makedocs( diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 8e1b87574..453cd9ee5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,7 +1,7 @@ site_name: PredictMD.jl repo_url: https://github.com/bcbi/PredictMD.jl -site_description: Uniform interface for machine learning in Julia 
-site_author: Center for Biomedical Informatics, Brown University +site_description: Uniform interface for machine learning in Julia +site_author: Center for Biomedical Informatics, Brown University theme: readthedocs @@ -22,6 +22,6 @@ docs_dir: 'build' pages: - Home: index.md + - Examples: examples.md - Library: - 'Internals': 'library/internals.md' - diff --git a/docs/src/examples.md b/docs/src/examples.md new file mode 100644 index 000000000..e69de29bb diff --git a/docs/test_examples.jl b/docs/test_examples.jl new file mode 100644 index 000000000..f9d74cfac --- /dev/null +++ b/docs/test_examples.jl @@ -0,0 +1,11 @@ +import Base.Test + +Base.Test.@testset "Test examples (CPU)" begin + info("INFO testing examples (CPU)") + Base.Test.@testset "Boston housing regression" begin + # include("") + end + Base.Test.@testset "Breast cancer biopsy classification" begin + # include("") + end +end diff --git a/examples_old/boston_housing.jl b/examples_old/boston_housing.jl deleted file mode 100644 index fc0babb6f..000000000 --- a/examples_old/boston_housing.jl +++ /dev/null @@ -1,603 +0,0 @@ -############################################################################## -############################################################################## -### INSTRUCTIONS FOR USING THIS FILE: ######################################## -############################################################################## -############################################################################## -## -## If you are running this file for the first time and/or if you do not have -## any trained models saved to disk, take the following steps: -## 1. Uncomment lines 27 and 28 -## 2. Comment out lines 30 and 31 -## 3. Set the variables on lines 33 through 37 to the filenames where you -## would like to save your models after training them. -## 4. Run the entire file. This will train the models, compare their -## performance, print metrics to the console, generate plots, and save -## the trained models to disk. -## -## If you already have trained models saved, and you would like to load those -## models from disk, take the following steps: -## 1. Comment out lines 27 and 28 -## 2. Uncomment lines 30 and 31 -## 3. Make sure the variables on lines 33 through 37 are set to the -## filenames where your trained models are currently saved. -## 4. Run the entire file. This will load the trained models from disk, -## compare their performance, print metrics to the console, and generate -## plots. 
- -# ENV["LOADTRAINEDMODELSFROMFILE"] = "false" -# ENV["SAVETRAINEDMODELSTOFILE"] = "true" - -# ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -# ENV["SAVETRAINEDMODELSTOFILE"] = "false" - -linearreg_filename = "/Users/dilum/Desktop/linearreg.jld2" -randomforestreg_filename = "/Users/dilum/Desktop/randomforestreg.jld2" -epsilonsvr_svmreg_filename = "/Users/dilum/Desktop/epsilonsvr_svmreg.jld2" -nusvr_svmreg_filename = "/Users/dilum/Desktop/nusvr_svmreg.jld2" -knetmlpreg_filename = "/Users/dilum/Desktop/knetmlpreg.jld2" - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import LIBSVM -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname]) - -# Split data into training set (70%) and testing set (30%) -training_features_df,testing_features_df,traininglabels_df,testing_labels_df = - PredictMD.split_data(features_df,labels_df,0.7) - -############################################################################## -############################################################################## -### Section 3: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Linear regression ######################################################### -############################################################################## - -# Set up linear regression model -linearreg = PredictMD.singlelabeldataframelinearregression( - featurenames, - labelname; - package = :GLMjl, - intercept = true, # optional, defaults to true - name = "Linear regression", # optional - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(linearreg_filename, linearreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(linearreg, feature_contrasts) - # Train linear regression model - PredictMD.fit!(linearreg,training_features_df,traininglabels_df,) -end - -# View coefficients, p values, etc. for underlying linear regression -PredictMD.get_underlying(linearreg) - -# Plot true values versus predicted values for linear regression on training set -linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - training_features_df, - traininglabels_df, - labelname, - ) -PredictMD.open(linearreg_plot_training) - -# Plot true values versus predicted values for linear regression on testing set -linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - testing_features_df, - testing_labels_df, - labelname - ) -PredictMD.open(linearreg_plot_testing) - -# Evaluate performance of linear regression on training set -PredictMD.singlelabelregressionmetrics( - linearreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of linear regression on testing set -PredictMD.singlelabelregressionmetrics( - linearreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Random forest regression ################################################## -############################################################################## - -# Set up random forest regression model -randomforestreg = PredictMD.singlelabeldataframerandomforestregression( - featurenames, - labelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest" # optional - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(randomforestreg_filename, randomforestreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(randomforestreg, feature_contrasts) - # Train random forest model on training set - PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,) -end - -# Plot true values versus predicted values for random forest on training set -randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, 
- training_features_df, - traininglabels_df, - labelname, - ) -PredictMD.open(randomforestreg_plot_training) - -# Plot true values versus predicted values for random forest on testing set -randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open(randomforestreg_plot_testing) - -# Evaluate performance of random forest on training set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Support vector machine (epsilon support vector regression) ################ -############################################################################## - -# Set up epsilon-SVR model -epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.EpsilonSVR, - name = "SVM (epsilon-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(epsilonsvr_svmreg, feature_contrasts) - # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,) -end - -# Plot true values versus predicted values for epsilon-SVR on training set -epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) -PredictMD.open(epsilonsvr_svmreg_plot_training) - -# Plot true values versus predicted values for epsilon-SVR on testing set -epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open(epsilonsvr_svmreg_plot_testing) - -# Evaluate performance of epsilon-SVR on training set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of epsilon-SVR on testing set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Support vector machine (nu support vector regression) ################ -############################################################################## - -# Set up nu-SVR model -nusvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVR, - name = "SVM (nu-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(nusvr_svmreg_filename, nusvr_svmreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(nusvr_svmreg, feature_contrasts) - # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,) -end - -# Plot true values versus predicted values for nu-SVR on training set -nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - 
training_features_df, - traininglabels_df, - labelname, - ) -PredictMD.open(nusvr_svmreg_plot_training) - -# Plot true values versus predicted values for nu-SVR on testing set -nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open(nusvr_svmreg_plot_testing) - -# Evaluate performance of nu-SVR on training set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of nu-SVR on testing set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Multilayer perceptron (i.e. fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - training::Bool = false, - ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - # No need to initialize weights since we are going to load them from file - knetmlp_modelweights = Any[] -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,10,1) # biases - ), - # - # output layer (regression nets have exactly 1 neuron in output layer): - Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights - ), - Cfloat.( - zeros(Cfloat,1,1) # biases - ), - ] -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict(modelweights, x), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -# Define loss hyperparameters -knetmlp_losshyperparameters = Dict() -knetmlp_losshyperparameters[:L1] = Cfloat(0.0) -knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - -# Select optimization algorithm -knetmlp_optimizationalgorithm = :Adam - -# Set optimization hyperparameters -knetmlp_optimizerhyperparameters = Dict() - -# Set the minibatch size -knetmlp_minibatchsize = 48 - -# Set the max number of epochs. After training, look at the learning curve. If -# it looks like the model has not yet converged, raise maxepochs. If it looks -# like the loss has hit a plateau and you are worried about overfitting, lower -# maxepochs. 
-knetmlp_maxepochs = 500 - -# Set up multilayer perceptron model -knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(knetmlpreg_filename, knetmlpreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(knetmlpreg, feature_contrasts) - # Train multilayer perceptron model on training set - PredictMD.fit!(knetmlpreg,training_features_df,traininglabels_df,) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsepoch; - ) -PredictMD.open(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsepoch; - startat = 10, - endat = :end, - ) -PredictMD.open(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsiteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsiteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot true values versus predicted values for multilayer perceptron on training set -knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - training_features_df, - traininglabels_df, - labelname, - ) -PredictMD.open(knetmlpreg_plot_training) - -# Plot true values versus predicted values for multilayer perceptron on testing set -knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open(knetmlpreg_plot_testing) - -# Evaluate performance of multilayer perceptron on training set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -############################################################################## -### Section 4: Compare performance of all models ############################## -############################################################################## -############################################################################## - -# Compare performance of all five models on training set -showall(PredictMD.singlelabelregressionmetrics( - [ - linearreg, - randomforestreg, - epsilonsvr_svmreg, - nusvr_svmreg, - knetmlpreg, - ], - training_features_df, - traininglabels_df, 
- labelname, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelregressionmetrics( - [ - linearreg, - randomforestreg, - epsilonsvr_svmreg, - nusvr_svmreg, - knetmlpreg, - ], - testing_features_df, - testing_labels_df, - labelname, - )) - -############################################################################## -############################################################################## -### Section 5: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save(linearreg_filename, linearreg) - PredictMD.save(randomforestreg_filename, randomforestreg) - PredictMD.save(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) - PredictMD.save(nusvr_svmreg_filename, nusvr_svmreg) - PredictMD.save(knetmlpreg_filename, knetmlpreg) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of regression models ############### -############################################################################## -############################################################################## - -# We can use the PredictMD.predict() function to get the real-valued predictions -# output by each of regression models. - -# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,training_features_df,) -PredictMD.predict(randomforestreg,training_features_df,) -PredictMD.predict(epsilonsvr_svmreg,training_features_df,) -PredictMD.predict(nusvr_svmreg,training_features_df,) -PredictMD.predict(knetmlpreg,training_features_df,) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testing_features_df,) -PredictMD.predict(randomforestreg,testing_features_df,) -PredictMD.predict(epsilonsvr_svmreg,testing_features_df,) -PredictMD.predict(nusvr_svmreg,testing_features_df,) -PredictMD.predict(knetmlpreg,testing_features_df,) diff --git a/examples_old/boston_housing/boston_housing_linear_regression.ipynb b/examples_old/boston_housing/boston_housing_linear_regression.ipynb deleted file mode 100644 index c7c5b8a71..000000000 --- a/examples_old/boston_housing/boston_housing_linear_regression.ipynb +++ /dev/null @@ -1,1561 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single-Label Random Forest Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 1: Setup " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mPrecompiling module PredictMD.\n", - "\u001b[39mWARNING: --output requested, but no modules defined during run\n", - "\u001b[1m\u001b[33mWARNING: \u001b[39m\u001b[22m\u001b[33mThe call to compilecache failed to create a usable precompiled cache file for module FFTW. 
Got:\u001b[39m\n", - "\u001b[1m\u001b[33mWARNING: \u001b[39m\u001b[22m\u001b[33mCache file \"/Users/mrestrep/.julia/lib/v0.6/FFTW.ji\" not found.\u001b[39m\n", - "WARNING: eval from module Main to ImageCore: \n", - "Expr(:call, Expr(:., :Base, :include_from_node1)::Any, \"/Users/mrestrep/.julia/v0.6/FFTW/src/FFTW.jl\")::Any\n", - " ** incremental compilation may be broken for this module **\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "MersenneTwister(UInt32[0x000003e7], Base.dSFMT.DSFMT_state(Int32[-412893719, 1072748155, -748568654, 1073610384, -1271302057, 1073556021, -429186579, 1073162675, 932796209, 1073458022 … 1115928124, 1073598513, 1280798571, 1072732908, -581554620, 1977796709, 1774936613, -1100988421, 382, 0]), [1.55164, 1.487, 1.85318, 1.95024, 1.20614, 1.39979, 1.29654, 1.25105, 1.08051, 1.08548 … 1.88474, 1.46801, 1.2853, 1.73083, 1.25907, 1.30943, 1.89382, 1.13456, 1.37224, 1.63709], 382)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# import required packages\n", - "import PredictMD\n", - "import CSV\n", - "import DataFrames\n", - "import GZip\n", - "import StatsBase\n", - "\n", - "# set the seed of the global random number generator\n", - "# this makes the results reproducible\n", - "srand(999)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running for the first time\n", - "\n", - "If you are running this file for the first time and/or if you do not have\n", - "any trained models saved to disk, uncomment the lines below to train a model and save it to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_pretrained = false\n", - "save_trained = true" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a pre-trained model\n", - "\n", - "If you already have trained models saved, and you would like to load those. Uncomment the lines below" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# load_pretrained = true\n", - "# save_trained = false" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set your paths" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"./knetmlpreg.jld2\"" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linearreg_filename = \"./linearreg.jld2\"\n", - "randomforestreg_filename = \"./randomforestreg.jld2\"\n", - "epsilonsvr_svmreg_filename = \"./epsilonsvr_svmreg.jld2\"\n", - "nusvr_svmreg_filename = \"./nusvr_svmreg.jld2\"\n", - "knetmlpreg_filename = \"./knetmlpreg.jld2\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 2: Prepare data " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
│ Row │ Crim    │ Zn   │ Indus │ Chas │ NOx   │ Rm    │ Age  │ Dis    │ Rad │ Tax │ PTRatio │ Black  │ LStat │ MedV │
├─────┼─────────┼──────┼───────┼──────┼───────┼───────┼──────┼────────┼─────┼─────┼─────────┼────────┼───────┼──────┤
│ 1   │ 0.00632 │ 18.0 │ 2.31  │ 0    │ 0.538 │ 6.575 │ 65.2 │ 4.09   │ 1   │ 296 │ 15.3    │ 396.9  │ 4.98  │ 24.0 │
│ 2   │ 0.02731 │ 0.0  │ 7.07  │ 0    │ 0.469 │ 6.421 │ 78.9 │ 4.9671 │ 2   │ 242 │ 17.8    │ 396.9  │ 9.14  │ 21.6 │
│ 3   │ 0.02729 │ 0.0  │ 7.07  │ 0    │ 0.469 │ 7.185 │ 61.1 │ 4.9671 │ 2   │ 242 │ 17.8    │ 392.83 │ 4.03  │ 34.7 │
│ 4   │ 0.03237 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 6.998 │ 45.8 │ 6.0622 │ 3   │ 222 │ 18.7    │ 394.63 │ 2.94  │ 33.4 │
│ 5   │ 0.06905 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 7.147 │ 54.2 │ 6.0622 │ 3   │ 222 │ 18.7    │ 396.9  │ 5.33  │ 36.2 │
│ 6   │ 0.02985 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 6.43  │ 58.7 │ 6.0622 │ 3   │ 222 │ 18.7    │ 394.12 │ 5.21  │ 28.7 │
" - ], - "text/plain": [ - "6×14 DataFrames.DataFrame. Omitted printing of 5 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼───────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 0.00632 │ 18.0 │ 2.31 │ 0 │ 0.538 │ 6.575 │ 65.2 │ 4.09 │ 1 │\n", - "│ 2 │ 0.02731 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 6.421 │ 78.9 │ 4.9671 │ 2 │\n", - "│ 3 │ 0.02729 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 7.185 │ 61.1 │ 4.9671 │ 2 │\n", - "│ 4 │ 0.03237 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.998 │ 45.8 │ 6.0622 │ 3 │\n", - "│ 5 │ 0.06905 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 7.147 │ 54.2 │ 6.0622 │ 3 │\n", - "│ 6 │ 0.02985 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.43 │ 58.7 │ 6.0622 │ 3 │" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Import Boston housing data\n", - "df = CSV.read(\n", - " GZip.gzopen(joinpath(Pkg.dir(\"RDatasets\"),\"data\",\"MASS\",\"Boston.csv.gz\")),\n", - " DataFrames.DataFrame,\n", - " )\n", - "\n", - "#take a quick look at file header and few rows\n", - "DataFrames.head(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "13-element Array{Symbol,1}:\n", - " :Crim \n", - " :Zn \n", - " :Indus \n", - " :Chas \n", - " :NOx \n", - " :Rm \n", - " :Age \n", - " :Dis \n", - " :Rad \n", - " :Tax \n", - " :PTRatio\n", - " :Black \n", - " :LStat " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Remove rows with missing data\n", - "DataFrames.dropmissing!(df)\n", - "\n", - "# Shuffle rows\n", - "PredictMD.shuffle_rows!(df)\n", - "\n", - "# Define labels\n", - "categoricalfeaturenames = Symbol[]\n", - "\n", - "continuousfeaturenames = Symbol[\n", - " :Crim,\n", - " :Zn,\n", - " :Indus,\n", - " :Chas,\n", - " :NOx,\n", - " :Rm,\n", - " :Age,\n", - " :Dis,\n", - " :Rad,\n", - " :Tax,\n", - " :PTRatio,\n", - " :Black,\n", - " :LStat,\n", - " ]\n", - "featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PredictMD.ImmutableDataFrameFeatureContrasts(Symbol[:Crim, :Zn, :Indus, :Chas, :NOx, :Rm, :Age, :Dis, :Rad, :Tax, :PTRatio, :Black, :LStat], 13, Dict{Symbol,StatsModels.ContrastsMatrix}(), 13)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if load_pretrained\n", - "else\n", - " contrasts = PredictMD.contrasts(df, featurenames)\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
│ Row │ Crim    │ Zn   │ Indus │ Chas │ NOx    │ Rm    │ Age  │ Dis    │ Rad │ Tax │ PTRatio │ Black  │ LStat │
├─────┼─────────┼──────┼───────┼──────┼────────┼───────┼──────┼────────┼─────┼─────┼─────────┼────────┼───────┤
│ 1   │ 88.9762 │ 0.0  │ 18.1  │ 0    │ 0.671  │ 6.968 │ 91.9 │ 1.4165 │ 24  │ 666 │ 20.2    │ 396.9  │ 17.21 │
│ 2   │ 0.06047 │ 0.0  │ 2.46  │ 0    │ 0.488  │ 6.153 │ 68.8 │ 3.2797 │ 3   │ 193 │ 17.8    │ 387.11 │ 13.15 │
│ 3   │ 0.1712  │ 0.0  │ 8.56  │ 0    │ 0.52   │ 5.836 │ 91.9 │ 2.211  │ 5   │ 384 │ 20.9    │ 395.67 │ 18.66 │
│ 4   │ 0.54452 │ 0.0  │ 21.89 │ 0    │ 0.624  │ 6.151 │ 97.9 │ 1.6687 │ 4   │ 437 │ 21.2    │ 396.9  │ 18.46 │
│ 5   │ 0.03466 │ 35.0 │ 6.06  │ 0    │ 0.4379 │ 6.031 │ 23.3 │ 6.6407 │ 1   │ 304 │ 16.9    │ 362.25 │ 7.83  │
│ 6   │ 0.02009 │ 95.0 │ 2.68  │ 0    │ 0.4161 │ 8.034 │ 31.9 │ 5.118  │ 4   │ 224 │ 14.7    │ 390.55 │ 2.88  │
" - ], - "text/plain": [ - "6×13 DataFrames.DataFrame. Omitted printing of 4 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼────────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 88.9762 │ 0.0 │ 18.1 │ 0 │ 0.671 │ 6.968 │ 91.9 │ 1.4165 │ 24 │\n", - "│ 2 │ 0.06047 │ 0.0 │ 2.46 │ 0 │ 0.488 │ 6.153 │ 68.8 │ 3.2797 │ 3 │\n", - "│ 3 │ 0.1712 │ 0.0 │ 8.56 │ 0 │ 0.52 │ 5.836 │ 91.9 │ 2.211 │ 5 │\n", - "│ 4 │ 0.54452 │ 0.0 │ 21.89 │ 0 │ 0.624 │ 6.151 │ 97.9 │ 1.6687 │ 4 │\n", - "│ 5 │ 0.03466 │ 35.0 │ 6.06 │ 0 │ 0.4379 │ 6.031 │ 23.3 │ 6.6407 │ 1 │\n", - "│ 6 │ 0.02009 │ 95.0 │ 2.68 │ 0 │ 0.4161 │ 8.034 │ 31.9 │ 5.118 │ 4 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - }, - { - "data": { - "text/html": [ - "
│ Row │ MedV │
├─────┼──────┤
│ 1   │ 10.4 │
│ 2   │ 29.6 │
│ 3   │ 19.5 │
│ 4   │ 17.8 │
│ 5   │ 19.4 │
│ 6   │ 50.0 │
" - ], - "text/plain": [ - "6×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼──────┤\n", - "│ 1 │ 10.4 │\n", - "│ 2 │ 29.6 │\n", - "│ 3 │ 19.5 │\n", - "│ 4 │ 17.8 │\n", - "│ 5 │ 19.4 │\n", - "│ 6 │ 50.0 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - } - ], - "source": [ - "# Define labels\n", - "labelname = :MedV\n", - "\n", - "# Put features and labels in separate dataframes\n", - "features_df = df[featurenames]\n", - "labels_df = df[[labelname]]\n", - "\n", - "# Display for exploration\n", - "display(DataFrames.head(features_df))\n", - "display(DataFrames.head(labels_df))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary Stats:\n", - "Mean: 22.532806\n", - "Minimum: 5.000000\n", - "1st Quartile: 17.025000\n", - "Median: 21.200000\n", - "3rd Quartile: 25.000000\n", - "Maximum: 50.000000\n", - "Length: 506\n", - "Type: Union{Float64, Missings.Missing}\n", - "Number Missing: 0\n", - "% Missing: 0.000000\n" - ] - } - ], - "source": [ - "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labels_df[labelname])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into training set (70%) and testing set (30%)\n", - "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", - " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 3: Set up and train models " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Linear regression" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mStarting to train GLM.jl model.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mFinished training GLM.jl model.\n", - "\u001b[39m" - ] - }, - { - "data": { - "text/plain": [ - "PredictMD.MutableGLMjlGeneralizedLinearModelEstimator(\"\", false, true, Formula: MedV ~ 1 + Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat, Distributions.Normal{Float64}(μ=0.0, σ=1.0), GLM.IdentityLink(), StatsModels.DataFrameRegressionModel{GLM.GeneralizedLinearModel{GLM.GlmResp{Array{Float64,1},Distributions.Normal{Float64},GLM.IdentityLink},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}\n", - "\n", - "Formula: MedV ~ 1 + Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat\n", - "\n", - "Coefficients:\n", - " Estimate Std.Error z value Pr(>|z|)\n", - "(Intercept) 24.4713 5.32528 4.59532 <1e-5\n", - "Crim -0.0769675 0.0486607 -1.58172 0.1137\n", - "Zn 0.0358691 0.014526 2.4693 0.0135\n", - "Indus 0.0195405 0.064063 0.30502 0.7604\n", - "Chas 2.12993 0.865244 2.46165 0.0138\n", - "NOx -13.905 3.81156 -3.64811 0.0003\n", - "Rm 4.97063 0.422884 11.7541 <1e-31\n", - "Age -0.0086508 0.0137787 -0.627839 0.5301\n", - "Dis -1.23188 0.202631 -6.07944 <1e-8\n", - "Rad 0.238656 0.0687518 3.47127 0.0005\n", - "Tax -0.0120952 0.0037724 -3.20624 0.0013\n", - "PTRatio -0.813687 0.141471 -5.75162 <1e-8\n", - "Black 0.0082818 0.00284193 2.91415 0.0036\n", - "LStat -0.462357 0.0542812 -8.51781 <1e-16\n", - ")" - ] - }, - 
"execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set up linear regression model\n", - "linearreg = PredictMD.singlelabeldataframelinearregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :GLMjl,\n", - " intercept = true, # optional, defaults to true\n", - " name = \"Linear regression\", # optional\n", - " )\n", - "\n", - "if load_pretrained\n", - " PredictMD.load!(linearreg_filename, linearreg)\n", - "else\n", - " # set feature contrasts\n", - " PredictMD.set_feature_contrasts!(linearreg, contrasts)\n", - " # Train linear regression model\n", - " PredictMD.fit!(linearreg,training_features_df,traininglabels_df,)\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "StatsModels.DataFrameRegressionModel{GLM.GeneralizedLinearModel{GLM.GlmResp{Array{Float64,1},Distributions.Normal{Float64},GLM.IdentityLink},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}\n", - "\n", - "Formula: MedV ~ 1 + Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat\n", - "\n", - "Coefficients:\n", - " Estimate Std.Error z value Pr(>|z|)\n", - "(Intercept) 24.4713 5.32528 4.59532 <1e-5\n", - "Crim -0.0769675 0.0486607 -1.58172 0.1137\n", - "Zn 0.0358691 0.014526 2.4693 0.0135\n", - "Indus 0.0195405 0.064063 0.30502 0.7604\n", - "Chas 2.12993 0.865244 2.46165 0.0138\n", - "NOx -13.905 3.81156 -3.64811 0.0003\n", - "Rm 4.97063 0.422884 11.7541 <1e-31\n", - "Age -0.0086508 0.0137787 -0.627839 0.5301\n", - "Dis -1.23188 0.202631 -6.07944 <1e-8\n", - "Rad 0.238656 0.0687518 3.47127 0.0005\n", - "Tax -0.0120952 0.0037724 -3.20624 0.0013\n", - "PTRatio -0.813687 0.141471 -5.75162 <1e-8\n", - "Black 0.0082818 0.00284193 2.91415 0.0036\n", - "LStat -0.462357 0.0542812 -8.51781 <1e-16\n" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# View coefficients, p values, etc. 
for underlying linear regression\n", - "PredictMD.get_underlying(linearreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [
[SVG/TikZ plot output omitted: black scatter of true values (x axis, "True value") versus predicted values (y axis, "Predicted value") for the linear regression model on the training set, with a red identity line]
33.0)\\n(33.1, 33.1)\\n(33.2, 33.2)\\n(33.4, 33.4)\\n(33.8, 33.8)\\n(34.6, 34.6)\\n(34.7, 34.7)\\n(34.9, 34.9)\\n(35.1, 35.1)\\n(35.2, 35.2)\\n(36.0, 36.0)\\n(36.1, 36.1)\\n(36.2, 36.2)\\n(36.4, 36.4)\\n(36.5, 36.5)\\n(37.2, 37.2)\\n(37.3, 37.3)\\n(38.7, 38.7)\\n(39.8, 39.8)\\n(41.7, 41.7)\\n(42.3, 42.3)\\n(42.8, 42.8)\\n(43.1, 43.1)\\n(43.8, 43.8)\\n(44.8, 44.8)\\n(45.4, 45.4)\\n(46.0, 46.0)\\n(48.3, 48.3)\\n(48.5, 48.5)\\n(48.8, 48.8)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for linear regression on training set\n", - "linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " linearreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )\n", - "# PredictMD.open(linearreg_plot_training)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - 
"\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[ylabel = {Predicted value}, xlabel = {True value}]\\\\addplot+ [only marks = {true}, black,fill=black]coordinates {\\n(10.4, 17.307629)\\n(50.0, 43.68415)\\n(50.0, 20.709995)\\n(15.2, 16.677622)\\n(26.2, 25.33278)\\n(5.0, 9.3874035)\\n(7.0, -3.9086676)\\n(27.1, 19.789768)\\n(35.4, 34.52805)\\n(17.7, 20.096237)\\n(8.8, 2.1361694)\\n(9.5, 13.730939)\\n(31.2, 29.224846)\\n(50.0, 23.794533)\\n(8.4, 16.102999)\\n(14.6, 8.366697)\\n(19.7, 13.81024)\\n(23.8, 22.47142)\\n(20.0, 18.262192)\\n(21.2, 23.28287)\\n(30.7, 31.478724)\\n(15.6, 13.613513)\\n(50.0, 24.077118)\\n(17.2, 17.49993)\\n(19.6, 18.02985)\\n(17.2, 12.870411)\\n(18.1, 18.064611)\\n(25.0, 28.712969)\\n(22.2, 25.673006)\\n(30.8, 31.261562)\\n(37.9, 34.100647)\\n(22.8, 24.82685)\\n(20.4, 20.74021)\\n(15.2, 11.52012)\\n(41.3, 32.948498)\\n(12.7, 17.3715)\\n(20.1, 21.384178)\\n(36.2, 27.345392)\\n(21.9, 23.98334)\\n(16.6, 18.47695)\\n(24.4, 23.256193)\\n(8.3, 10.314537)\\n(35.4, 32.197983)\\n(18.0, 19.6662)\\n(23.2, 22.118673)\\n(24.7, 25.29324)\\n(22.9, 28.733961)\\n(28.5, 33.394825)\\n(8.5, 8.071171)\\n(25.0, 28.484873)\\n(24.6, 28.59651)\\n(13.1, 16.417212)\\n(14.3, 17.425169)\\n(7.2, 17.945482)\\n(28.6, 29.06742)\\n(23.4, 24.195408)\\n(14.9, 17.46771)\\n(10.9, 15.015252)\\n(11.0, 14.626603)\\n(8.5, 15.578151)\\n(28.7, 31.541962)\\n(20.0, 20.628826)\\n(23.9, 25.937847)\\n(15.6, 15.87953)\\n(43.5, 39.560505)\\n(44.0, 37.83176)\\n(22.1, 26.588543)\\n(20.5, 23.930973)\\n(24.8, 30.811924)\\n(22.3, 26.321346)\\n(15.0, 15.1811)\\n(13.3, 15.881809)\\n(11.9, 22.414227)\\n(22.0, 26.895222)\\n(18.8, 20.483963)\\n(18.7, 21.676773)\\n(23.1, 25.260527)\\n(20.0, 22.935633)\\n(21.7, 21.091072)\\n(37.6, 38.608078)\\n(37.0, 31.232914)\\n(24.8, 26.205854)\\n(26.7, 32.697376)\\n(19.3, 16.21538)\\n(16.1, 18.547758)\\n(20.3, 23.25049)\\n(21.0, 22.033115)\\n(10.8, 11.705548)\\n(50.0, 33.34107)\\n(20.6, 26.524364)\\n(22.2, 20.075588)\\n(14.4, 4.202575)\\n(23.6, 31.073662)\\n(18.9, 21.47424)\\n(22.0, 27.96587)\\n(26.6, 29.2326)\\n(20.6, 22.669031)\\n(23.9, 28.478619)\\n(24.6, 25.59813)\\n(6.3, 11.171997)\\n(21.7, 21.0339)\\n(11.7, 13.868667)\\n(33.4, 35.88015)\\n(24.3, 23.945244)\\n(13.1, 14.552215)\\n(26.5, 26.242386)\\n(20.9, 21.722576)\\n(50.0, 36.999237)\\n(23.9, 27.208769)\\n(8.7, 9.807059)\\n(13.2, 10.261778)\\n(18.3, 18.978975)\\n(21.8, 20.126347)\\n(18.6, 17.569756)\\n(19.5, 20.540478)\\n(24.0, 29.953798)\\n(22.5, 28.753218)\\n(26.4, 21.79762)\\n(16.8, 19.042164)\\n(33.2, 33.268394)\\n(21.9, 39.008686)\\n(23.9, 27.884686)\\n(46.7, 36.8128)\\n(17.1, 20.082497)\\n(16.5, 23.001993)\\n(25.0, 28.20407)\\n(20.7, 24.877964)\\n(19.8, 18.457872)\\n(22.8, 26.728502)\\n(18.4, 20.518614)\\n(18.5, 24.739166)\\n(31.5, 31.224401)\\n(13.3, 20.684006)\\n(19.9, 19.583626)\\n(22.2, 24.358793)\\n(16.1, 21.27202)\\n(17.8, 17.893286)\\n(27.1, 27.708494)\\n(23.1, 24.648024)\\n(19.1, 23.385029)\\n(33.3, 36.318195)\\n(11.3, 12.701179)\\n(28.7, 25.798624)\\n(19.4, 20.018675)\\n(19.9, 17.377274)\\n(13.8, -0.6173121)\\n(19.4, 24.551264)\\n(20.3, 22.209385)\\n(13.8, 19.807957)\\n(16.1, 17.549986)\\n(18.9, 19.11404)\\n(19.0, 21.667042)\\n};\\n\\\\addplot+ [mark = {none}, red]coordinates {\\n(5.0, 
5.0)\\n(6.3, 6.3)\\n(7.0, 7.0)\\n(7.2, 7.2)\\n(8.3, 8.3)\\n(8.4, 8.4)\\n(8.5, 8.5)\\n(8.7, 8.7)\\n(8.8, 8.8)\\n(9.5, 9.5)\\n(10.4, 10.4)\\n(10.8, 10.8)\\n(10.9, 10.9)\\n(11.0, 11.0)\\n(11.3, 11.3)\\n(11.7, 11.7)\\n(11.9, 11.9)\\n(12.7, 12.7)\\n(13.1, 13.1)\\n(13.2, 13.2)\\n(13.3, 13.3)\\n(13.8, 13.8)\\n(14.3, 14.3)\\n(14.4, 14.4)\\n(14.6, 14.6)\\n(14.9, 14.9)\\n(15.0, 15.0)\\n(15.2, 15.2)\\n(15.6, 15.6)\\n(16.1, 16.1)\\n(16.5, 16.5)\\n(16.6, 16.6)\\n(16.8, 16.8)\\n(17.1, 17.1)\\n(17.2, 17.2)\\n(17.7, 17.7)\\n(17.8, 17.8)\\n(18.0, 18.0)\\n(18.1, 18.1)\\n(18.3, 18.3)\\n(18.4, 18.4)\\n(18.5, 18.5)\\n(18.6, 18.6)\\n(18.7, 18.7)\\n(18.8, 18.8)\\n(18.9, 18.9)\\n(19.0, 19.0)\\n(19.1, 19.1)\\n(19.3, 19.3)\\n(19.4, 19.4)\\n(19.5, 19.5)\\n(19.6, 19.6)\\n(19.7, 19.7)\\n(19.8, 19.8)\\n(19.9, 19.9)\\n(20.0, 20.0)\\n(20.1, 20.1)\\n(20.3, 20.3)\\n(20.4, 20.4)\\n(20.5, 20.5)\\n(20.6, 20.6)\\n(20.7, 20.7)\\n(20.9, 20.9)\\n(21.0, 21.0)\\n(21.2, 21.2)\\n(21.7, 21.7)\\n(21.8, 21.8)\\n(21.9, 21.9)\\n(22.0, 22.0)\\n(22.1, 22.1)\\n(22.2, 22.2)\\n(22.3, 22.3)\\n(22.5, 22.5)\\n(22.8, 22.8)\\n(22.9, 22.9)\\n(23.1, 23.1)\\n(23.2, 23.2)\\n(23.4, 23.4)\\n(23.6, 23.6)\\n(23.8, 23.8)\\n(23.9, 23.9)\\n(24.0, 24.0)\\n(24.3, 24.3)\\n(24.4, 24.4)\\n(24.6, 24.6)\\n(24.7, 24.7)\\n(24.8, 24.8)\\n(25.0, 25.0)\\n(26.2, 26.2)\\n(26.4, 26.4)\\n(26.5, 26.5)\\n(26.6, 26.6)\\n(26.7, 26.7)\\n(27.1, 27.1)\\n(28.5, 28.5)\\n(28.6, 28.6)\\n(28.7, 28.7)\\n(30.7, 30.7)\\n(30.8, 30.8)\\n(31.2, 31.2)\\n(31.5, 31.5)\\n(33.2, 33.2)\\n(33.3, 33.3)\\n(33.4, 33.4)\\n(35.4, 35.4)\\n(36.2, 36.2)\\n(37.0, 37.0)\\n(37.6, 37.6)\\n(37.9, 37.9)\\n(41.3, 41.3)\\n(43.5, 43.5)\\n(44.0, 44.0)\\n(46.7, 46.7)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for linear regression on testing set\n", - "linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " linearreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname\n", - " )\n", - "# PredictMD.open(linearreg_plot_testing)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
metricLinear regression
1R^2 (coefficient of determination)0.800524
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Linear regression │\n", - "├─────┼────────────────────────────────────┼───────────────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.800524 │" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of linear regression on training set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " linearreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
metricLinear regression
1R^2 (coefficient of determination)0.59293
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Linear regression │\n", - "├─────┼────────────────────────────────────┼───────────────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.59293 │" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of linear regression on testing set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " linearreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Section 4: Save trained models to file (if desired)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mSaved model to file ./linearreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "if save_trained\n", - " PredictMD.save(linearreg_filename, linearreg)\n", - "end" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Appendix A: Directly access the output of regression models" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "152×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼───────────┤\n", - "│ 1 │ 17.3076 │\n", - "│ 2 │ 43.6842 │\n", - "│ 3 │ 20.71 │\n", - "│ 4 │ 16.6776 │\n", - "│ 5 │ 25.3328 │\n", - "│ 6 │ 9.3874 │\n", - "│ 7 │ -3.90867 │\n", - "│ 8 │ 19.7898 │\n", - "│ 9 │ 34.528 │\n", - "│ 10 │ 20.0962 │\n", - "│ 11 │ 2.13617 │\n", - "⋮\n", - "│ 141 │ 36.3182 │\n", - "│ 142 │ 12.7012 │\n", - "│ 143 │ 25.7986 │\n", - "│ 144 │ 20.0187 │\n", - "│ 145 │ 17.3773 │\n", - "│ 146 │ -0.617312 │\n", - "│ 147 │ 24.5513 │\n", - "│ 148 │ 22.2094 │\n", - "│ 149 │ 19.808 │\n", - "│ 150 │ 17.55 │\n", - "│ 151 │ 19.114 │\n", - "│ 152 │ 21.667 │" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can use the PredictMD.predict() function to get the real-valued predictions\n", - "# output by each of regression models.\n", - "\n", - "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(linearreg,training_features_df,)\n", - "\n", - "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(linearreg,testing_features_df,)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 0.6.2", - "language": "julia", - "name": "julia-0.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "0.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples_old/boston_housing/boston_housing_linear_regression.jl b/examples_old/boston_housing/boston_housing_linear_regression.jl deleted file mode 100644 index 827e11d09..000000000 --- a/examples_old/boston_housing/boston_housing_linear_regression.jl +++ /dev/null @@ -1,149 +0,0 @@ - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -load_pretrained = false -save_trained = true - -# load_pretrained = true -# save_trained = false - -linearreg_filename = "./linearreg.jld2" -randomforestreg_filename = "./randomforestreg.jld2" -epsilonsvr_svmreg_filename = "./epsilonsvr_svmreg.jld2" -nusvr_svmreg_filename = "./nusvr_svmreg.jld2" -knetmlpreg_filename = "./knetmlpreg.jld2" - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -#take a quick look at file header and few rows -DataFrames.head(df) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] - -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if load_pretrained -else - contrasts = PredictMD.contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Display for exploration -display(DataFrames.head(features_df)) -display(DataFrames.head(labels_df)) - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname])
-
-# Split data into training set (70%) and testing set (30%)
-training_features_df,testing_features_df,traininglabels_df,testing_labels_df =
-    PredictMD.split_data(features_df,labels_df,0.7);
-
-# Set up linear regression model
-linearreg = PredictMD.singlelabeldataframelinearregression(
-    featurenames,
-    labelname;
-    package = :GLMjl,
-    intercept = true, # optional, defaults to true
-    name = "Linear regression", # optional
-    )
-
-if load_pretrained
-    PredictMD.load!(linearreg_filename, linearreg)
-else
-    # set feature contrasts
-    PredictMD.set_feature_contrasts!(linearreg, contrasts)
-    # Train linear regression model
-    PredictMD.fit!(linearreg,training_features_df,traininglabels_df,)
-end
-
-# View coefficients, p values, etc. for the underlying linear regression
-PredictMD.get_underlying(linearreg)
-
-# Plot true values versus predicted values for linear regression on training set
-linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    linearreg,
-    training_features_df,
-    traininglabels_df,
-    labelname,
-    )
-# PredictMD.open(linearreg_plot_training)
-
-# Plot true values versus predicted values for linear regression on testing set
-linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    linearreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-# PredictMD.open(linearreg_plot_testing)
-
-# Evaluate performance of linear regression on training set
-PredictMD.singlelabelregressionmetrics(
-    linearreg,
-    training_features_df,
-    traininglabels_df,
-    labelname,
-    )
-
-# Evaluate performance of linear regression on testing set
-PredictMD.singlelabelregressionmetrics(
-    linearreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-
-if save_trained
-    PredictMD.save(linearreg_filename, linearreg)
-end
-
-# We can use the PredictMD.predict() function to get the real-valued predictions
-# output by each of the regression models; a hand-computed R^2 check is
-# sketched below as an illustration.
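-
-# A minimal illustrative sketch (not part of the original example): the R^2
-# reported by PredictMD.singlelabelregressionmetrics() can be reproduced by
-# hand. This assumes PredictMD.predict() returns a single-column DataFrame
-# keyed by `labelname`, as the calls below show; `mean`, `sum`, and `abs2`
-# come from Base in Julia 0.6.
-ypred = PredictMD.predict(linearreg, testing_features_df)[labelname]
-ytrue = testing_labels_df[labelname]
-ss_residual = sum(abs2, ytrue .- ypred)        # residual sum of squares
-ss_total    = sum(abs2, ytrue .- mean(ytrue))  # total sum of squares
-r2 = 1 - ss_residual/ss_total                  # should be close to the ~0.59 reported above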
- -# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,training_features_df,) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testing_features_df,) diff --git a/examples_old/boston_housing/boston_housing_metric_comparison.ipynb b/examples_old/boston_housing/boston_housing_metric_comparison.ipynb deleted file mode 100644 index 3dd9e7355..000000000 --- a/examples_old/boston_housing/boston_housing_metric_comparison.ipynb +++ /dev/null @@ -1,382 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare Regression Models\n", - "\n", - "* This assumes you have trained and save linear-regression, svm and mlp packages" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Section 1: Set Up" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MersenneTwister(UInt32[0x000003e7], Base.dSFMT.DSFMT_state(Int32[-412893719, 1072748155, -748568654, 1073610384, -1271302057, 1073556021, -429186579, 1073162675, 932796209, 1073458022 … 1115928124, 1073598513, 1280798571, 1072732908, -581554620, 1977796709, 1774936613, -1100988421, 382, 0]), [1.62319, 1.35281, 1.03829, 1.06242, 1.31737, 1.67826, 1.16578, 1.98973, 1.90715, 1.53549 … 1.16349, 1.38708, 1.88594, 1.3401, 1.06464, 1.90276, 1.52995, 1.91265, 1.4553, 1.6623], 382)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# import required packages\n", - "import PredictMD\n", - "import CSV\n", - "import DataFrames\n", - "import GZip\n", - "import StatsBase\n", - "\n", - "# set the seed of the global random number generator\n", - "# this makes the results reproducible\n", - "srand(999)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Section 2: Compare performance of all models " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Load and prepare data\n", - "\n", - "# Import Boston housing data\n", - "df = CSV.read(\n", - " GZip.gzopen(joinpath(Pkg.dir(\"RDatasets\"),\"data\",\"MASS\",\"Boston.csv.gz\")),\n", - " DataFrames.DataFrame,\n", - " )\n", - "\n", - "# Remove rows with missing data\n", - "DataFrames.dropmissing!(df)\n", - "\n", - "# Shuffle rows\n", - "PredictMD.shuffle_rows!(df)\n", - "\n", - "# Define labels\n", - "featurenames = Symbol[\n", - " :Crim,\n", - " :Zn,\n", - " :Indus,\n", - " :Chas,\n", - " :NOx,\n", - " :Rm,\n", - " :Age,\n", - " :Dis,\n", - " :Rad,\n", - " :Tax,\n", - " :PTRatio,\n", - " :Black,\n", - " :LStat,\n", - " ]\n", - "\n", - "labelname = :MedV\n", - "\n", - "# Put features and labels in separate dataframes\n", - "features_df = df[featurenames]\n", - "labels_df = df[[labelname]]\n", - "\n", - "# Split data into training set (70%) and testing set (30%)\n", - "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", - " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mLoaded model from file ./linearreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "# load pre-trained models\n", - "linearreg_filename = \"./linearreg.jld2\"\n", - "\n", - "# Set up linear regression model\n", - 
"linearreg = PredictMD.singlelabeldataframelinearregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :GLMjl,\n", - " intercept = true, # optional, defaults to true\n", - " name = \"Linear regression\", # optional\n", - " )\n", - "PredictMD.load!(linearreg_filename, linearreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mLoaded model from file ./randomforestreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "# Set up random forest regression model\n", - "randomforestreg_filename = \"./randomforestreg.jld2\"\n", - "\n", - "randomforestreg = PredictMD.singlelabeldataframerandomforestregression(\n", - " featurenames,\n", - " labelname;\n", - " nsubfeatures = 2, # number of subfeatures; defaults to 2\n", - " ntrees = 20, # number of trees; defaults to 10\n", - " package = :DecisionTreejl,\n", - " name = \"Random forest\" # optional\n", - " )\n", - "PredictMD.load!(randomforestreg_filename, randomforestreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mLoaded model from file ./epsilonsvr_svmreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "# Set up epsilon-SVR model\n", - "epsilonsvr_svmreg_filename = \"./epsilonsvr_svmreg.jld2\"\n", - "\n", - "epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :LIBSVMjl,\n", - " svmtype = LIBSVM.EpsilonSVR,\n", - " name = \"SVM (epsilon-SVR)\",\n", - " kernel = LIBSVM.Kernel.Linear,\n", - " verbose = false,\n", - " )\n", - "PredictMD.load!(epsilonsvr_svmreg_filename, epsilonsvr_svmreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mLoaded model from file ./nusvr_svmreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "# Set up nu-SVR model\n", - "nusvr_svmreg_filename = \"./nusvr_svmreg.jld2\"\n", - "nusvr_svmreg = PredictMD.singlelabeldataframesvmregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :LIBSVMjl,\n", - " svmtype = LIBSVM.NuSVR,\n", - " name = \"SVM (nu-SVR)\",\n", - " kernel = LIBSVM.Kernel.Linear,\n", - " verbose = false,\n", - " )\n", - "PredictMD.load!(nusvr_svmreg_filename, nusvr_svmreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mLoaded model from file ./knetmlpreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "\n", - "# Set up multilayer perceptron model\n", - "knetmlpreg_filename = \"./knetmlpreg.jld2\"\n", - "\n", - "#This should be defined somewhere else\n", - "\n", - "# Define predict function\n", - "function knetmlp_predict(\n", - " w, # don't put a type annotation on this\n", - " x0::AbstractArray;\n", - " training::Bool = false,\n", - " )\n", - " # x0 = input layer\n", - " # x1 = hidden layer\n", - " x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases\n", - " # x2 = output layer\n", - " x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases\n", - " return x2\n", - "end\n", - "\n", - "# Define loss function\n", - "function 
knetmlp_loss(\n", - " predict::Function,\n", - " modelweights, # don't put a type annotation on this\n", - " x::AbstractArray,\n", - " ytrue::AbstractArray;\n", - " L1::Real = Cfloat(0),\n", - " L2::Real = Cfloat(0),\n", - " )\n", - " loss = mean(\n", - " abs2,\n", - " ytrue - predict(modelweights, x),\n", - " )\n", - " if L1 != 0\n", - " loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])\n", - " end\n", - " if L2 != 0\n", - " loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])\n", - " end\n", - " return loss\n", - "end\n", - "\n", - "# Define loss hyperparameters\n", - "knetmlp_losshyperparameters = Dict()\n", - "knetmlp_losshyperparameters[:L1] = Cfloat(0.0)\n", - "knetmlp_losshyperparameters[:L2] = Cfloat(0.0)\n", - "\n", - "# Select optimization algorithm\n", - "knetmlp_optimizationalgorithm = :Adam\n", - "\n", - "# Set optimization hyperparameters\n", - "knetmlp_optimizerhyperparameters = Dict()\n", - "\n", - "# Set the minibatch size\n", - "knetmlp_minibatchsize = 48\n", - "\n", - "# Set the max number of epochs. After training, look at the learning curve. If\n", - "# it looks like the model has not yet converged, raise maxepochs. If it looks\n", - "# like the loss has hit a plateau and you are worried about overfitting, lower\n", - "# maxepochs.\n", - "knetmlp_maxepochs = 500\n", - "\n", - "knetmlp_modelweights = Any[]\n", - "\n", - "knetmlpreg = PredictMD.singlelabeldataframeknetregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :Knetjl,\n", - " name = \"Knet MLP\",\n", - " predict = knetmlp_predict,\n", - " loss = knetmlp_loss,\n", - " losshyperparameters = knetmlp_losshyperparameters,\n", - " optimizationalgorithm = knetmlp_optimizationalgorithm,\n", - " optimizerhyperparameters = knetmlp_optimizerhyperparameters,\n", - " minibatchsize = knetmlp_minibatchsize,\n", - " modelweights = knetmlp_modelweights,\n", - " maxepochs = knetmlp_maxepochs,\n", - " printlosseverynepochs = 100, # if 0, will not print at all\n", - " )\n", - "PredictMD.load!(knetmlpreg_filename, knetmlpreg)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1×6 DataFrames.DataFrame\n", - "│ Row │ metric │ Linear regression │ Random forest │ SVM (epsilon-SVR) │ SVM (nu-SVR) │ Knet MLP │\n", - "├─────┼────────────────────────────────────┼───────────────────┼───────────────┼───────────────────┼──────────────┼──────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.800524 │ 0.929843 │ -6.30321 │ -6.30321 │ 0.731002 │1×6 DataFrames.DataFrame\n", - "│ Row │ metric │ Linear regression │ Random forest │ SVM (epsilon-SVR) │ SVM (nu-SVR) │ Knet MLP │\n", - "├─────┼────────────────────────────────────┼───────────────────┼───────────────┼───────────────────┼──────────────┼──────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.59293 │ 0.699748 │ -5.42347 │ -5.42347 │ 0.553721 │" - ] - } - ], - "source": [ - "# Compare performance of all five models on training set\n", - "showall(PredictMD.singlelabelregressionmetrics(\n", - " [\n", - " linearreg,\n", - " randomforestreg,\n", - " epsilonsvr_svmreg,\n", - " nusvr_svmreg,\n", - " knetmlpreg,\n", - " ],\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " ))\n", - "\n", - "# Compare performance of all models on testing set\n", - "showall(PredictMD.singlelabelregressionmetrics(\n", - " [\n", - " linearreg,\n", - " randomforestreg,\n", - " epsilonsvr_svmreg,\n", - " 
nusvr_svmreg,\n", - " knetmlpreg,\n", - " ],\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " ))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 0.6.2", - "language": "julia", - "name": "julia-0.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "0.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples_old/boston_housing/boston_housing_metric_comparison.jl b/examples_old/boston_housing/boston_housing_metric_comparison.jl deleted file mode 100644 index eb3cb6fbb..000000000 --- a/examples_old/boston_housing/boston_housing_metric_comparison.jl +++ /dev/null @@ -1,214 +0,0 @@ - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -# Load and prepare data - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -featurenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] - -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Split data into training set (70%) and testing set (30%) -training_features_df,testing_features_df,traininglabels_df,testing_labels_df = - PredictMD.split_data(features_df,labels_df,0.7); - -# load pre-trained models -linearreg_filename = "./linearreg.jld2" - -# Set up linear regression model -linearreg = PredictMD.singlelabeldataframelinearregression( - featurenames, - labelname; - package = :GLMjl, - intercept = true, # optional, defaults to true - name = "Linear regression", # optional - ) -PredictMD.load!(linearreg_filename, linearreg) - -# Set up random forest regression model -randomforestreg_filename = "./randomforestreg.jld2" - -randomforestreg = PredictMD.singlelabeldataframerandomforestregression( - featurenames, - labelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest" # optional - ) -PredictMD.load!(randomforestreg_filename, randomforestreg) - -# Set up epsilon-SVR model -epsilonsvr_svmreg_filename = "./epsilonsvr_svmreg.jld2" - -epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.EpsilonSVR, - name = "SVM (epsilon-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) -PredictMD.load!(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) - -# Set up nu-SVR model -nusvr_svmreg_filename = "./nusvr_svmreg.jld2" -nusvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVR, - name = "SVM (nu-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) -PredictMD.load!(nusvr_svmreg_filename, nusvr_svmreg) - - -# Set up multilayer perceptron model -knetmlpreg_filename = "./knetmlpreg.jld2" - -#This should be defined somewhere else - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - training::Bool = false, - ) - # x0 = input 
layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict(modelweights, x), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -# Define loss hyperparameters -knetmlp_losshyperparameters = Dict() -knetmlp_losshyperparameters[:L1] = Cfloat(0.0) -knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - -# Select optimization algorithm -knetmlp_optimizationalgorithm = :Adam - -# Set optimization hyperparameters -knetmlp_optimizerhyperparameters = Dict() - -# Set the minibatch size -knetmlp_minibatchsize = 48 - -# Set the max number of epochs. After training, look at the learning curve. If -# it looks like the model has not yet converged, raise maxepochs. If it looks -# like the loss has hit a plateau and you are worried about overfitting, lower -# maxepochs. -knetmlp_maxepochs = 500 - -knetmlp_modelweights = Any[] - -knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - ) -PredictMD.load!(knetmlpreg_filename, knetmlpreg) - -# Compare performance of all five models on training set -showall(PredictMD.singlelabelregressionmetrics( - [ - linearreg, - randomforestreg, - epsilonsvr_svmreg, - nusvr_svmreg, - knetmlpreg, - ], - training_features_df, - traininglabels_df, - labelname, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelregressionmetrics( - [ - linearreg, - randomforestreg, - epsilonsvr_svmreg, - nusvr_svmreg, - knetmlpreg, - ], - testing_features_df, - testing_labels_df, - labelname, - )) diff --git a/examples_old/boston_housing/boston_housing_mlp.ipynb b/examples_old/boston_housing/boston_housing_mlp.ipynb deleted file mode 100644 index 2f3bf980d..000000000 --- a/examples_old/boston_housing/boston_housing_mlp.ipynb +++ /dev/null @@ -1,2619 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single-label Regression using Multiple Layer Perceptron" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 1: Setup " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MersenneTwister(UInt32[0x000003e7], Base.dSFMT.DSFMT_state(Int32[-412893719, 1072748155, -748568654, 1073610384, -1271302057, 1073556021, -429186579, 1073162675, 932796209, 1073458022 … 1115928124, 1073598513, 1280798571, 1072732908, -581554620, 1977796709, 1774936613, -1100988421, 382, 0]), [1.95356, 1.42529, 1.71404, 1.90562, 1.91634, 1.30429, 1.39097, 1.53682, 1.91387, 1.99986 … 1.7474, 1.37092, 1.02709, 1.45976, 1.54413, 1.04316, 1.03421, 1.03289, 
1.82133, 1.35197], 382)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# import required packages\n", - "import PredictMD\n", - "import CSV\n", - "import DataFrames\n", - "import GZip\n", - "import Knet\n", - "import StatsBase\n", - "\n", - "# set the seed of the global random number generator\n", - "# this makes the results reproducible\n", - "srand(999)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running for the first time\n", - "\n", - "If you are running this file for the first time and/or if you do not have\n", - "any trained models saved to disk, uncomment the lines below to train a model and save it to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_pretrained = false\n", - "save_trained = true" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a pre-trained model\n", - "\n", - "If you already have trained models saved, and you would like to load those. Uncomment the lines below" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# load_trained = true\n", - "# save_trained = false" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set your paths" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"./knetmlpreg.jld2\"" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "knetmlpreg_filename = \"./knetmlpreg.jld2\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 2: Prepare data " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(HTML preview of DataFrames.head(df) omitted; see the plain-text rendering below)"
-      ],
-      "text/plain": [
-       "6×14 DataFrames.DataFrame. Omitted printing of 5 columns\n",
-       "│ Row │ Crim    │ Zn   │ Indus │ Chas │ NOx   │ Rm    │ Age  │ Dis    │ Rad │\n",
-       "├─────┼─────────┼──────┼───────┼──────┼───────┼───────┼──────┼────────┼─────┤\n",
-       "│ 1   │ 0.00632 │ 18.0 │ 2.31  │ 0    │ 0.538 │ 6.575 │ 65.2 │ 4.09   │ 1   │\n",
-       "│ 2   │ 0.02731 │ 0.0  │ 7.07  │ 0    │ 0.469 │ 6.421 │ 78.9 │ 4.9671 │ 2   │\n",
-       "│ 3   │ 0.02729 │ 0.0  │ 7.07  │ 0    │ 0.469 │ 7.185 │ 61.1 │ 4.9671 │ 2   │\n",
-       "│ 4   │ 0.03237 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 6.998 │ 45.8 │ 6.0622 │ 3   │\n",
-       "│ 5   │ 0.06905 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 7.147 │ 54.2 │ 6.0622 │ 3   │\n",
-       "│ 6   │ 0.02985 │ 0.0  │ 2.18  │ 0    │ 0.458 │ 6.43  │ 58.7 │ 6.0622 │ 3   │"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Import Boston housing data\n",
-    "df = CSV.read(\n",
-    "    GZip.gzopen(joinpath(Pkg.dir(\"RDatasets\"),\"data\",\"MASS\",\"Boston.csv.gz\")),\n",
-    "    DataFrames.DataFrame,\n",
-    "    )\n",
-    "\n",
-    "# Take a quick look at the file header and the first few rows\n",
-    "DataFrames.head(df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "13-element Array{Symbol,1}:\n",
-       " :Crim\n",
-       " :Zn\n",
-       " :Indus\n",
-       " :Chas\n",
-       " :NOx\n",
-       " :Rm\n",
-       " :Age\n",
-       " :Dis\n",
-       " :Rad\n",
-       " :Tax\n",
-       " :PTRatio\n",
-       " :Black\n",
-       " :LStat"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Remove rows with missing data\n",
-    "DataFrames.dropmissing!(df)\n",
-    "\n",
-    "# Shuffle rows\n",
-    "PredictMD.shuffle_rows!(df)\n",
-    "\n",
-    "# Define features\n",
-    "categoricalfeaturenames = Symbol[]\n",
-    "\n",
-    "continuousfeaturenames = Symbol[\n",
-    "    :Crim,\n",
-    "    :Zn,\n",
-    "    :Indus,\n",
-    "    :Chas,\n",
-    "    :NOx,\n",
-    "    :Rm,\n",
-    "    :Age,\n",
-    "    :Dis,\n",
-    "    :Rad,\n",
-    "    :Tax,\n",
-    "    :PTRatio,\n",
-    "    :Black,\n",
-    "    :LStat,\n",
-    "    ]\n",
-    "featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "PredictMD.ImmutableDataFrameFeatureContrasts(Symbol[:Crim, :Zn, :Indus, :Chas, :NOx, :Rm, :Age, :Dis, :Rad, :Tax, :PTRatio, :Black, :LStat], 13, Dict{Symbol,StatsModels.ContrastsMatrix}(), 13)"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "if !load_pretrained\n",
-    "    contrasts = PredictMD.contrasts(df, featurenames)\n",
-    "end"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "(HTML preview of the features DataFrame omitted; see the plain-text rendering below)
" - ], - "text/plain": [ - "6×13 DataFrames.DataFrame. Omitted printing of 4 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼────────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 88.9762 │ 0.0 │ 18.1 │ 0 │ 0.671 │ 6.968 │ 91.9 │ 1.4165 │ 24 │\n", - "│ 2 │ 0.06047 │ 0.0 │ 2.46 │ 0 │ 0.488 │ 6.153 │ 68.8 │ 3.2797 │ 3 │\n", - "│ 3 │ 0.1712 │ 0.0 │ 8.56 │ 0 │ 0.52 │ 5.836 │ 91.9 │ 2.211 │ 5 │\n", - "│ 4 │ 0.54452 │ 0.0 │ 21.89 │ 0 │ 0.624 │ 6.151 │ 97.9 │ 1.6687 │ 4 │\n", - "│ 5 │ 0.03466 │ 35.0 │ 6.06 │ 0 │ 0.4379 │ 6.031 │ 23.3 │ 6.6407 │ 1 │\n", - "│ 6 │ 0.02009 │ 95.0 │ 2.68 │ 0 │ 0.4161 │ 8.034 │ 31.9 │ 5.118 │ 4 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - }, - { - "data": { - "text/html": [ - "
(HTML preview of the labels DataFrame omitted; see the plain-text rendering below)
" - ], - "text/plain": [ - "6×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼──────┤\n", - "│ 1 │ 10.4 │\n", - "│ 2 │ 29.6 │\n", - "│ 3 │ 19.5 │\n", - "│ 4 │ 17.8 │\n", - "│ 5 │ 19.4 │\n", - "│ 6 │ 50.0 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - } - ], - "source": [ - "# Define labels\n", - "labelname = :MedV\n", - "\n", - "# Put features and labels in separate dataframes\n", - "features_df = df[featurenames]\n", - "labels_df = df[[labelname]]\n", - "\n", - "# Display for exploration\n", - "display(DataFrames.head(features_df))\n", - "display(DataFrames.head(labels_df))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary Stats:\n", - "Mean: 22.532806\n", - "Minimum: 5.000000\n", - "1st Quartile: 17.025000\n", - "Median: 21.200000\n", - "3rd Quartile: 25.000000\n", - "Maximum: 50.000000\n", - "Length: 506\n", - "Type: Union{Float64, Missings.Missing}\n", - "Number Missing: 0\n", - "% Missing: 0.000000\n" - ] - } - ], - "source": [ - "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labels_df[labelname])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into training set (70%) and testing set (30%)\n", - "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", - " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 3: Set up and train models " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multilayer perceptron (i.e. 
fully connected feedforward neural network)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "knetmlp_predict (generic function with 1 method)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Define predict function\n", - "function knetmlp_predict(\n", - " w, # don't put a type annotation on this\n", - " x0::AbstractArray;\n", - " training::Bool = false,\n", - " )\n", - " # x0 = input layer\n", - " # x1 = hidden layer\n", - " x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases\n", - " # x2 = output layer\n", - " x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases\n", - " return x2\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4-element Array{Any,1}:\n", - " Float32[-0.0897565 0.0974769 … -0.260205 0.0299987; 0.0725558 -0.0355528 … -0.0390284 0.0524487; … ; -0.0717704 -0.10111 … -0.0631411 -0.0306858; 0.00933819 -0.046184 … 0.326549 -0.0479391]\n", - " Float32[0.0; 0.0; … ; 0.0; 0.0] \n", - " Float32[0.03517 0.0896541 … -0.18434 0.0292523] \n", - " Float32[0.0] " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if load_pretrained\n", - " # No need to initialize weights since we are going to load them from file\n", - " knetmlp_modelweights = Any[]\n", - "else\n", - " # Randomly initialize model weights\n", - " knetmlp_modelweights = Any[\n", - " # input layer has dimension contrasts.num_array_columns\n", - " #\n", - " # hidden layer (10 neurons):\n", - " Cfloat.(\n", - " 0.1f0*randn(Cfloat,10,contrasts.num_array_columns) # weights\n", - " ),\n", - " Cfloat.(\n", - " zeros(Cfloat,10,1) # biases\n", - " ),\n", - " #\n", - " # output layer (regression nets have exactly 1 neuron in output layer):\n", - " Cfloat.(\n", - " 0.1f0*randn(Cfloat,1,10) # weights\n", - " ),\n", - " Cfloat.(\n", - " zeros(Cfloat,1,1) # biases\n", - " ),\n", - " ]\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "knetmlp_loss (generic function with 1 method)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Define loss function\n", - "function knetmlp_loss(\n", - " predict::Function,\n", - " modelweights, # don't put a type annotation on this\n", - " x::AbstractArray,\n", - " ytrue::AbstractArray;\n", - " L1::Real = Cfloat(0),\n", - " L2::Real = Cfloat(0),\n", - " )\n", - " loss = mean(\n", - " abs2,\n", - " ytrue - predict(modelweights, x),\n", - " )\n", - " if L1 != 0\n", - " loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])\n", - " end\n", - " if L2 != 0\n", - " loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])\n", - " end\n", - " return loss\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mStarting to train Knet.jl model. Max epochs: 500.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 0. Loss: 293.5584.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 100. Loss: 37.992123.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 200. 
Loss: 27.809109.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 300. Loss: 25.35328.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 400. Loss: 23.582445.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mEpoch: 500. Loss: 22.027395.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mFinished training Knet.jl model.\n", - "\u001b[39m" - ] - }, - { - "data": { - "text/plain": [ - "PredictMD.KnetModel(\"Knet MLP\", false, true, knetmlp_predict, knetmlp_loss, Dict{Any,Any}(Pair{Any,Any}(:L2, 0.0),Pair{Any,Any}(:L1, 0.0)), :Adam, Dict{Any,Any}(), 48, 500, 100, Any[Float32[-0.104315 0.0974769 … -0.275 0.0135046; 0.0725558 -0.0355528 … -0.0390284 0.0524487; … ; 0.0200607 0.00669115 … 0.0202091 0.516147; -0.206359 -0.0222494 … 0.412806 -0.906976], Float32[-0.0175526; 0.0; … ; -0.571523; 0.380575], Float32[0.0236321 0.0896541 … -0.522333 0.206521], Float32[0.373899]], Knet.Adam[Knet.Adam(0.001, 0.0, 0.9, 0.999, 1.0e-8, 3500, Float32[5.60519f-45 0.0 … 5.60519f-45 5.60519f-45; 0.0 0.0 … 0.0 0.0; … ; -0.506289 -0.21575 … -26.1193 0.587646; 0.0708102 0.0859936 … 10.7848 0.0623458], Float32[2.12288f-5 0.0 … 2.27653f-5 3.43631f-5; 0.0 0.0 … 0.0 0.0; … ; 16.851 412.409 … 38030.4 109.662; 0.434212 76.4882 … 8933.93 11.7333]), Knet.Adam(0.001, 0.0, 0.9, 0.999, 1.0e-8, 3500, Float32[5.60519f-45; 0.0; … ; 0.00653508; 0.00265247], Float32[8.06275f-8; 0.0; … ; 0.305199; 0.0624154]), Knet.Adam(0.001, 0.0, 0.9, 0.999, 1.0e-8, 3500, Float32[5.60519f-45 0.0 … 11.2413 7.35248], Float32[0.00136076 0.0 … 3997.53 9407.41]), Knet.Adam(0.001, 0.0, 0.9, 0.999, 1.0e-8, 3500, Float32[-0.0125298], Float32[1.63609])], MVHistory{ValueHistories.History}\n", - " :lossatiteration => 3500 elements {Int64,Float32}\n", - " :lossatepoch => 500 elements {Int64,Float32}\n", - " :epochatiteration => 501 elements {Int64,Int64})" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Define loss hyperparameters\n", - "knetmlp_losshyperparameters = Dict()\n", - "knetmlp_losshyperparameters[:L1] = Cfloat(0.0)\n", - "knetmlp_losshyperparameters[:L2] = Cfloat(0.0)\n", - "\n", - "# Select optimization algorithm\n", - "knetmlp_optimizationalgorithm = :Adam\n", - "\n", - "# Set optimization hyperparameters\n", - "knetmlp_optimizerhyperparameters = Dict()\n", - "\n", - "# Set the minibatch size\n", - "knetmlp_minibatchsize = 48\n", - "\n", - "# Set the max number of epochs. After training, look at the learning curve. If\n", - "# it looks like the model has not yet converged, raise maxepochs. 
-    "# Set the maximum number of epochs. After training, look at the learning\n",
-    "# curve: if the model does not appear to have converged yet, raise\n",
-    "# maxepochs; if the loss has hit a plateau and you are worried about\n",
-    "# overfitting, lower maxepochs.\n",
-    "knetmlp_maxepochs = 500\n",
-    "\n",
-    "# Set up multilayer perceptron model\n",
-    "knetmlpreg = PredictMD.singlelabeldataframeknetregression(\n",
-    "    featurenames,\n",
-    "    labelname;\n",
-    "    package = :Knetjl,\n",
-    "    name = \"Knet MLP\",\n",
-    "    predict = knetmlp_predict,\n",
-    "    loss = knetmlp_loss,\n",
-    "    losshyperparameters = knetmlp_losshyperparameters,\n",
-    "    optimizationalgorithm = knetmlp_optimizationalgorithm,\n",
-    "    optimizerhyperparameters = knetmlp_optimizerhyperparameters,\n",
-    "    minibatchsize = knetmlp_minibatchsize,\n",
-    "    modelweights = knetmlp_modelweights,\n",
-    "    maxepochs = knetmlp_maxepochs,\n",
-    "    printlosseverynepochs = 100, # if 0, will not print at all\n",
-    "    )\n",
-    "\n",
-    "if load_pretrained == \"true\"\n",
-    "    PredictMD.load!(knetmlpreg_filename, knetmlpreg)\n",
-    "else\n",
-    "    # Set feature contrasts\n",
-    "    PredictMD.set_feature_contrasts!(knetmlpreg, contrasts)\n",
-    "    # Train multilayer perceptron model on training set\n",
-    "    PredictMD.fit!(knetmlpreg, training_features_df, traininglabels_df)\n",
-    "end"
-   ]
-  },
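The comment above states the maxepochs heuristic in prose. Below is a minimal sketch, not part of the original notebook, of how one might apply it programmatically. It assumes the `history` field shown in the printed model summary is accessible and exposes the `:lossatepoch` series; ValueHistories' `get` returns the epoch indices and loss values.

```julia
# Sketch only: quantify whether the epoch-loss curve is still falling near
# the end of training. If it is, consider raising knetmlp_maxepochs and
# refitting; if it has plateaued, consider lowering it.
using ValueHistories
epochs, epochlosses = get(knetmlpreg.history, :lossatepoch)
tail_drop = (epochlosses[end - 10] - epochlosses[end]) / epochlosses[end - 10]
if tail_drop > 0.005
    info("Loss still decreasing over the last 10 epochs; maxepochs may be too low.")
end
```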
epoch\n", - "knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve(\n", - " knetmlpreg,\n", - " :lossvsepoch;\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[legend pos = {north east}, ylabel = {Loss}, xlabel = {Epoch}]\\\\addplot+ [mark = {none}]coordinates {\\n(10.0, 73.06547)\\n(11.0, 70.070984)\\n(12.0, 67.7723)\\n(13.0, 66.088936)\\n(14.0, 64.828064)\\n(15.0, 63.85856)\\n(16.0, 63.05433)\\n(17.0, 62.36548)\\n(18.0, 61.747314)\\n(19.0, 61.16389)\\n(20.0, 60.610268)\\n(21.0, 60.064587)\\n(22.0, 59.527206)\\n(23.0, 59.007698)\\n(24.0, 58.50957)\\n(25.0, 58.02146)\\n(26.0, 57.538124)\\n(27.0, 57.05319)\\n(28.0, 56.553425)\\n(29.0, 56.067455)\\n(30.0, 55.610546)\\n(31.0, 55.172955)\\n(32.0, 54.75757)\\n(33.0, 54.361973)\\n(34.0, 53.97814)\\n(35.0, 53.62207)\\n(36.0, 53.27738)\\n(37.0, 52.96509)\\n(38.0, 52.658104)\\n(39.0, 52.372902)\\n(40.0, 52.102177)\\n(41.0, 51.837803)\\n(42.0, 51.57438)\\n(43.0, 51.31314)\\n(44.0, 51.054626)\\n(45.0, 50.79847)\\n(46.0, 50.543674)\\n(47.0, 50.297806)\\n(48.0, 50.054356)\\n(49.0, 49.81176)\\n(50.0, 49.570618)\\n(51.0, 49.3295)\\n(52.0, 49.087868)\\n(53.0, 48.846485)\\n(54.0, 48.60569)\\n(55.0, 48.365387)\\n(56.0, 48.125496)\\n(57.0, 47.88594)\\n(58.0, 47.64662)\\n(59.0, 47.407482)\\n(60.0, 47.168488)\\n(61.0, 46.929634)\\n(62.0, 46.69105)\\n(63.0, 46.451656)\\n(64.0, 46.21241)\\n(65.0, 45.974354)\\n(66.0, 45.737015)\\n(67.0, 45.499157)\\n(68.0, 45.261364)\\n(69.0, 45.023895)\\n(70.0, 44.78783)\\n(71.0, 44.551495)\\n(72.0, 44.315464)\\n(73.0, 44.079823)\\n(74.0, 43.844658)\\n(75.0, 43.61019)\\n(76.0, 43.376358)\\n(77.0, 43.143158)\\n(78.0, 42.910656)\\n(79.0, 42.678837)\\n(80.0, 42.44774)\\n(81.0, 42.217422)\\n(82.0, 41.98793)\\n(83.0, 41.75931)\\n(84.0, 41.531734)\\n(85.0, 41.303646)\\n(86.0, 41.078163)\\n(87.0, 40.85367)\\n(88.0, 40.62936)\\n(89.0, 40.40781)\\n(90.0, 40.187065)\\n(91.0, 39.96654)\\n(92.0, 39.74912)\\n(93.0, 39.530823)\\n(94.0, 39.31653)\\n(95.0, 39.099842)\\n(96.0, 
38.861702)\\n(97.0, 38.63483)\\n(98.0, 38.412132)\\n(99.0, 38.19727)\\n(100.0, 37.992123)\\n(101.0, 37.789356)\\n(102.0, 37.586628)\\n(103.0, 37.385006)\\n(104.0, 37.18302)\\n(105.0, 36.986244)\\n(106.0, 36.78993)\\n(107.0, 36.59751)\\n(108.0, 36.404724)\\n(109.0, 36.216812)\\n(110.0, 36.030388)\\n(111.0, 35.848106)\\n(112.0, 35.664627)\\n(113.0, 35.48706)\\n(114.0, 35.310913)\\n(115.0, 35.135635)\\n(116.0, 34.965942)\\n(117.0, 34.791847)\\n(118.0, 34.629368)\\n(119.0, 34.46447)\\n(120.0, 34.3047)\\n(121.0, 34.142803)\\n(122.0, 33.988853)\\n(123.0, 33.83632)\\n(124.0, 33.68352)\\n(125.0, 33.535957)\\n(126.0, 33.388794)\\n(127.0, 33.247246)\\n(128.0, 33.10628)\\n(129.0, 32.969383)\\n(130.0, 32.831684)\\n(131.0, 32.69996)\\n(132.0, 32.569195)\\n(133.0, 32.440968)\\n(134.0, 32.3164)\\n(135.0, 32.193874)\\n(136.0, 32.072674)\\n(137.0, 31.953243)\\n(138.0, 31.83847)\\n(139.0, 31.723965)\\n(140.0, 31.615562)\\n(141.0, 31.504421)\\n(142.0, 31.39866)\\n(143.0, 31.292215)\\n(144.0, 31.192945)\\n(145.0, 31.09136)\\n(146.0, 30.99442)\\n(147.0, 30.896706)\\n(148.0, 30.803484)\\n(149.0, 30.710905)\\n(150.0, 30.62153)\\n(151.0, 30.533352)\\n(152.0, 30.447464)\\n(153.0, 30.363104)\\n(154.0, 30.280676)\\n(155.0, 30.199936)\\n(156.0, 30.12095)\\n(157.0, 30.043835)\\n(158.0, 29.968433)\\n(159.0, 29.894617)\\n(160.0, 29.822327)\\n(161.0, 29.751545)\\n(162.0, 29.68222)\\n(163.0, 29.611607)\\n(164.0, 29.546535)\\n(165.0, 29.480198)\\n(166.0, 29.415451)\\n(167.0, 29.353886)\\n(168.0, 29.289608)\\n(169.0, 29.23223)\\n(170.0, 29.169563)\\n(171.0, 29.112953)\\n(172.0, 29.05605)\\n(173.0, 28.999556)\\n(174.0, 28.94486)\\n(175.0, 28.892511)\\n(176.0, 28.838318)\\n(177.0, 28.787252)\\n(178.0, 28.735125)\\n(179.0, 28.685614)\\n(180.0, 28.63617)\\n(181.0, 28.588428)\\n(182.0, 28.540983)\\n(183.0, 28.494715)\\n(184.0, 28.44896)\\n(185.0, 28.40418)\\n(186.0, 28.360037)\\n(187.0, 28.316727)\\n(188.0, 28.274057)\\n(189.0, 28.232101)\\n(190.0, 28.190762)\\n(191.0, 28.150074)\\n(192.0, 28.109985)\\n(193.0, 28.070494)\\n(194.0, 28.031565)\\n(195.0, 27.993202)\\n(196.0, 27.955368)\\n(197.0, 27.918066)\\n(198.0, 27.881256)\\n(199.0, 27.844948)\\n(200.0, 27.809109)\\n(201.0, 27.773735)\\n(202.0, 27.738806)\\n(203.0, 27.704313)\\n(204.0, 27.670244)\\n(205.0, 27.636581)\\n(206.0, 27.60332)\\n(207.0, 27.57044)\\n(208.0, 27.537937)\\n(209.0, 27.505804)\\n(210.0, 27.47401)\\n(211.0, 27.442568)\\n(212.0, 27.41146)\\n(213.0, 27.380682)\\n(214.0, 27.350222)\\n(215.0, 27.320059)\\n(216.0, 27.290194)\\n(217.0, 27.260626)\\n(218.0, 27.23134)\\n(219.0, 27.20232)\\n(220.0, 27.173576)\\n(221.0, 27.145082)\\n(222.0, 27.116852)\\n(223.0, 27.088856)\\n(224.0, 27.061104)\\n(225.0, 27.033585)\\n(226.0, 27.00629)\\n(227.0, 26.979227)\\n(228.0, 26.952372)\\n(229.0, 26.92572)\\n(230.0, 26.899277)\\n(231.0, 26.873035)\\n(232.0, 26.846983)\\n(233.0, 26.821129)\\n(234.0, 26.795458)\\n(235.0, 26.769957)\\n(236.0, 26.74463)\\n(237.0, 26.719479)\\n(238.0, 26.694496)\\n(239.0, 26.66968)\\n(240.0, 26.645012)\\n(241.0, 26.620508)\\n(242.0, 26.59615)\\n(243.0, 26.571945)\\n(244.0, 26.547882)\\n(245.0, 26.523956)\\n(246.0, 26.500172)\\n(247.0, 26.476519)\\n(248.0, 26.453009)\\n(249.0, 26.429611)\\n(250.0, 26.40635)\\n(251.0, 26.383204)\\n(252.0, 26.36018)\\n(253.0, 26.337275)\\n(254.0, 26.31449)\\n(255.0, 26.291811)\\n(256.0, 26.269241)\\n(257.0, 26.246788)\\n(258.0, 26.224428)\\n(259.0, 26.202175)\\n(260.0, 26.180063)\\n(261.0, 26.15805)\\n(262.0, 26.136131)\\n(263.0, 26.114313)\\n(264.0, 26.092398)\\n(265.0, 26.07059)\\n(266.0, 26.048866)\\n(267.0, 
26.027222)\\n(268.0, 26.005672)\\n(269.0, 25.984198)\\n(270.0, 25.962818)\\n(271.0, 25.941511)\\n(272.0, 25.920292)\\n(273.0, 25.899149)\\n(274.0, 25.878073)\\n(275.0, 25.857077)\\n(276.0, 25.836159)\\n(277.0, 25.815313)\\n(278.0, 25.79454)\\n(279.0, 25.773834)\\n(280.0, 25.753197)\\n(281.0, 25.732632)\\n(282.0, 25.712118)\\n(283.0, 25.69169)\\n(284.0, 25.671312)\\n(285.0, 25.650997)\\n(286.0, 25.630754)\\n(287.0, 25.610567)\\n(288.0, 25.590443)\\n(289.0, 25.570374)\\n(290.0, 25.550362)\\n(291.0, 25.530415)\\n(292.0, 25.510513)\\n(293.0, 25.490673)\\n(294.0, 25.470884)\\n(295.0, 25.45116)\\n(296.0, 25.43148)\\n(297.0, 25.411856)\\n(298.0, 25.392282)\\n(299.0, 25.372755)\\n(300.0, 25.35328)\\n(301.0, 25.333855)\\n(302.0, 25.314478)\\n(303.0, 25.295149)\\n(304.0, 25.275866)\\n(305.0, 25.256632)\\n(306.0, 25.237436)\\n(307.0, 25.218304)\\n(308.0, 25.199202)\\n(309.0, 25.180145)\\n(310.0, 25.161135)\\n(311.0, 25.142172)\\n(312.0, 25.120897)\\n(313.0, 25.104315)\\n(314.0, 25.083319)\\n(315.0, 25.06655)\\n(316.0, 25.04711)\\n(317.0, 25.02894)\\n(318.0, 25.01003)\\n(319.0, 24.991453)\\n(320.0, 24.973057)\\n(321.0, 24.954514)\\n(322.0, 24.936192)\\n(323.0, 24.917665)\\n(324.0, 24.899292)\\n(325.0, 24.878494)\\n(326.0, 24.862473)\\n(327.0, 24.841913)\\n(328.0, 24.82567)\\n(329.0, 24.806768)\\n(330.0, 24.789011)\\n(331.0, 24.770657)\\n(332.0, 24.752491)\\n(333.0, 24.734606)\\n(334.0, 24.7165)\\n(335.0, 24.698639)\\n(336.0, 24.678196)\\n(337.0, 24.662584)\\n(338.0, 24.642336)\\n(339.0, 24.626472)\\n(340.0, 24.60799)\\n(341.0, 24.590534)\\n(342.0, 24.572594)\\n(343.0, 24.554739)\\n(344.0, 24.53724)\\n(345.0, 24.519459)\\n(346.0, 24.49958)\\n(347.0, 24.484154)\\n(348.0, 24.464287)\\n(349.0, 24.448668)\\n(350.0, 24.430534)\\n(351.0, 24.413334)\\n(352.0, 24.395744)\\n(353.0, 24.37815)\\n(354.0, 24.360962)\\n(355.0, 24.343464)\\n(356.0, 24.32386)\\n(357.0, 24.308704)\\n(358.0, 24.289124)\\n(359.0, 24.273758)\\n(360.0, 24.255941)\\n(361.0, 24.238949)\\n(362.0, 24.22168)\\n(363.0, 24.204332)\\n(364.0, 24.187433)\\n(365.0, 24.167816)\\n(366.0, 24.15316)\\n(367.0, 24.133715)\\n(368.0, 24.118605)\\n(369.0, 24.10106)\\n(370.0, 24.08423)\\n(371.0, 24.067265)\\n(372.0, 24.0501)\\n(373.0, 24.031078)\\n(374.0, 24.01639)\\n(375.0, 23.997385)\\n(376.0, 23.98236)\\n(377.0, 23.965086)\\n(378.0, 23.94841)\\n(379.0, 23.931664)\\n(380.0, 23.914698)\\n(381.0, 23.895866)\\n(382.0, 23.881367)\\n(383.0, 23.86255)\\n(384.0, 23.847694)\\n(385.0, 23.830647)\\n(386.0, 23.814112)\\n(387.0, 23.797592)\\n(388.0, 23.780787)\\n(389.0, 23.762156)\\n(390.0, 23.74782)\\n(391.0, 23.7292)\\n(392.0, 23.714485)\\n(393.0, 23.697662)\\n(394.0, 23.68126)\\n(395.0, 23.664957)\\n(396.0, 23.645924)\\n(397.0, 23.632206)\\n(398.0, 23.613497)\\n(399.0, 23.599113)\\n(400.0, 23.582445)\\n(401.0, 23.566141)\\n(402.0, 23.550053)\\n(403.0, 23.531122)\\n(404.0, 23.517601)\\n(405.0, 23.499043)\\n(406.0, 23.484777)\\n(407.0, 23.468304)\\n(408.0, 23.452099)\\n(409.0, 23.436203)\\n(410.0, 23.417406)\\n(411.0, 23.40401)\\n(412.0, 23.385632)\\n(413.0, 23.37146)\\n(414.0, 23.35519)\\n(415.0, 23.339071)\\n(416.0, 23.320948)\\n(417.0, 23.30703)\\n(418.0, 23.289259)\\n(419.0, 23.274988)\\n(420.0, 23.259018)\\n(421.0, 23.243015)\\n(422.0, 23.224968)\\n(423.0, 23.211235)\\n(424.0, 23.193546)\\n(425.0, 23.179415)\\n(426.0, 23.163586)\\n(427.0, 23.14767)\\n(428.0, 23.12978)\\n(429.0, 23.116135)\\n(430.0, 23.098595)\\n(431.0, 23.084549)\\n(432.0, 23.068882)\\n(433.0, 23.053055)\\n(434.0, 23.035305)\\n(435.0, 23.021755)\\n(436.0, 23.00435)\\n(437.0, 22.9904)\\n(438.0, 
22.974876)\\n(439.0, 22.956715)\\n(440.0, 22.943897)\\n(441.0, 22.923948)\\n(442.0, 22.906906)\\n(443.0, 22.890488)\\n(444.0, 22.873043)\\n(445.0, 22.857544)\\n(446.0, 22.840357)\\n(447.0, 22.823793)\\n(448.0, 22.809107)\\n(449.0, 22.793236)\\n(450.0, 22.77912)\\n(451.0, 22.762672)\\n(452.0, 22.747843)\\n(453.0, 22.730606)\\n(454.0, 22.717327)\\n(455.0, 22.701193)\\n(456.0, 22.685354)\\n(457.0, 22.670223)\\n(458.0, 22.655508)\\n(459.0, 22.640131)\\n(460.0, 22.624937)\\n(461.0, 22.607918)\\n(462.0, 22.594961)\\n(463.0, 22.577322)\\n(464.0, 22.564491)\\n(465.0, 22.548462)\\n(466.0, 22.531822)\\n(467.0, 22.518866)\\n(468.0, 22.502504)\\n(469.0, 22.486551)\\n(470.0, 22.473547)\\n(471.0, 22.455797)\\n(472.0, 22.443228)\\n(473.0, 22.427263)\\n(474.0, 22.41086)\\n(475.0, 22.398018)\\n(476.0, 22.381836)\\n(477.0, 22.365965)\\n(478.0, 22.352936)\\n(479.0, 22.336916)\\n(480.0, 22.32106)\\n(481.0, 22.30821)\\n(482.0, 22.29072)\\n(483.0, 22.278328)\\n(484.0, 22.262655)\\n(485.0, 22.24632)\\n(486.0, 22.233814)\\n(487.0, 22.2178)\\n(488.0, 22.202124)\\n(489.0, 22.189537)\\n(490.0, 22.17208)\\n(491.0, 22.159922)\\n(492.0, 22.14439)\\n(493.0, 22.128138)\\n(494.0, 22.11591)\\n(495.0, 22.099993)\\n(496.0, 22.084505)\\n(497.0, 22.072044)\\n(498.0, 22.05472)\\n(499.0, 22.042782)\\n(500.0, 22.027395)\\n};\\n\\\\addlegendentry{Loss function}\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot learning curve: loss vs. epoch, skip the first 10 epochs\n", - "knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(\n", - " knetmlpreg,\n", - " :lossvsepoch;\n", - " startat = 10,\n", - " endat = :end,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " 
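Why trim the first 10 epochs? The coordinate data behind the two plots above shows the loss falling from about 223 at epoch 1 to about 73 by epoch 10, but only to about 22 by epoch 500, so the steep opening compresses the y-axis and hides the slow late-stage descent. A small sketch (not in the original notebook, reading the same `:lossatepoch` history as above) makes that concrete:

```julia
# How much of the total loss reduction happens in the first 10 epochs?
epochs, epochlosses = get(knetmlpreg.history, :lossatepoch)
total_drop  = epochlosses[1] - epochlosses[end]   # about 223.06 - 22.03
early_drop  = epochlosses[1] - epochlosses[10]    # about 223.06 - 73.07
early_share = early_drop / total_drop             # roughly 0.75
```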
\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[legend pos = {north east}, ylabel = {Loss}, xlabel = {Iteration}]\\\\addplot+ [mark = {none}]coordinates {\\n(1.0, 313.86624)\\n(11.0, 119.663414)\\n(21.0, 135.32416)\\n(31.0, 114.049706)\\n(41.0, 91.40281)\\n(51.0, 99.262825)\\n(61.0, 80.18047)\\n(71.0, 96.24615)\\n(81.0, 36.03404)\\n(91.0, 61.47792)\\n(101.0, 73.726006)\\n(111.0, 53.580597)\\n(121.0, 72.03362)\\n(131.0, 59.47449)\\n(141.0, 78.522736)\\n(151.0, 35.46438)\\n(161.0, 54.6862)\\n(171.0, 69.8697)\\n(181.0, 47.036198)\\n(191.0, 67.04252)\\n(201.0, 52.454395)\\n(211.0, 75.135216)\\n(221.0, 30.466108)\\n(231.0, 49.864445)\\n(241.0, 66.43721)\\n(251.0, 41.598736)\\n(261.0, 62.87541)\\n(271.0, 48.55667)\\n(281.0, 72.11381)\\n(291.0, 28.79484)\\n(301.0, 46.449173)\\n(311.0, 63.28807)\\n(321.0, 38.312542)\\n(331.0, 59.85079)\\n(341.0, 46.316864)\\n(351.0, 69.21533)\\n(361.0, 27.490644)\\n(371.0, 43.91917)\\n(381.0, 60.510056)\\n(391.0, 36.101273)\\n(401.0, 56.985195)\\n(411.0, 44.009552)\\n(421.0, 66.59302)\\n(431.0, 26.072496)\\n(441.0, 41.45106)\\n(451.0, 57.646454)\\n(461.0, 33.978886)\\n(471.0, 54.09479)\\n(481.0, 41.795433)\\n(491.0, 63.96741)\\n(501.0, 24.855085)\\n(511.0, 38.969593)\\n(521.0, 54.637356)\\n(531.0, 31.945383)\\n(541.0, 51.18637)\\n(551.0, 39.665455)\\n(561.0, 61.3235)\\n(571.0, 23.864426)\\n(581.0, 36.5203)\\n(591.0, 51.520107)\\n(601.0, 30.028475)\\n(611.0, 48.29855)\\n(621.0, 37.63811)\\n(631.0, 58.686478)\\n(641.0, 23.166674)\\n(651.0, 34.145634)\\n(661.0, 48.363224)\\n(671.0, 28.316874)\\n(681.0, 45.46829)\\n(691.0, 35.6623)\\n(701.0, 55.783817)\\n(711.0, 22.741755)\\n(721.0, 31.869585)\\n(731.0, 45.40376)\\n(741.0, 26.77564)\\n(751.0, 42.787838)\\n(761.0, 33.936523)\\n(771.0, 53.25423)\\n(781.0, 22.674362)\\n(791.0, 29.883736)\\n(801.0, 42.40569)\\n(811.0, 25.43984)\\n(821.0, 40.34223)\\n(831.0, 32.418518)\\n(841.0, 50.937595)\\n(851.0, 22.898699)\\n(861.0, 28.16136)\\n(871.0, 39.640484)\\n(881.0, 24.39772)\\n(891.0, 38.15173)\\n(901.0, 31.119627)\\n(911.0, 48.86731)\\n(921.0, 23.341286)\\n(931.0, 26.730074)\\n(941.0, 37.154446)\\n(951.0, 23.561811)\\n(961.0, 36.254337)\\n(971.0, 30.011316)\\n(981.0, 47.10204)\\n(991.0, 23.933474)\\n(1001.0, 25.595144)\\n(1011.0, 34.99144)\\n(1021.0, 22.92366)\\n(1031.0, 34.643234)\\n(1041.0, 29.086191)\\n(1051.0, 45.588394)\\n(1061.0, 24.57969)\\n(1071.0, 24.727224)\\n(1081.0, 33.139534)\\n(1091.0, 22.458504)\\n(1101.0, 33.302128)\\n(1111.0, 28.303185)\\n(1121.0, 44.357983)\\n(1131.0, 25.205772)\\n(1141.0, 24.078627)\\n(1151.0, 31.564386)\\n(1161.0, 22.103903)\\n(1171.0, 32.195187)\\n(1181.0, 27.627762)\\n(1191.0, 43.29082)\\n(1201.0, 25.783873)\\n(1211.0, 23.592293)\\n(1221.0, 30.284964)\\n(1231.0, 21.849699)\\n(1241.0, 31.280996)\\n(1251.0, 27.01908)\\n(1261.0, 42.405075)\\n(1271.0, 26.24704)\\n(1281.0, 23.237406)\\n(1291.0, 29.226202)\\n(1301.0, 21.636091)\\n(1311.0, 30.531784)\\n(1321.0, 26.479431)\\n(1331.0, 41.678967)\\n(1341.0, 26.59752)\\n(1351.0, 22.980865)\\n(1361.0, 28.323252)\\n(1371.0, 21.447329)\\n(1381.0, 
29.923872)\\n(1391.0, 25.986038)\\n(1401.0, 41.07056)\\n(1411.0, 26.848837)\\n(1421.0, 22.78505)\\n(1431.0, 27.558922)\\n(1441.0, 21.279188)\\n(1451.0, 29.416847)\\n(1461.0, 25.528152)\\n(1471.0, 40.544937)\\n(1481.0, 27.014069)\\n(1491.0, 22.62945)\\n(1501.0, 26.909937)\\n(1511.0, 21.125338)\\n(1521.0, 28.987991)\\n(1531.0, 25.097776)\\n(1541.0, 40.082836)\\n(1551.0, 27.106478)\\n(1561.0, 22.499765)\\n(1571.0, 26.35344)\\n(1581.0, 20.98028)\\n(1591.0, 28.619873)\\n(1601.0, 24.690437)\\n(1611.0, 39.670246)\\n(1621.0, 27.139587)\\n(1631.0, 22.385918)\\n(1641.0, 25.870443)\\n(1651.0, 20.840448)\\n(1661.0, 28.298904)\\n(1671.0, 24.30317)\\n(1681.0, 39.296337)\\n(1691.0, 27.125816)\\n(1701.0, 22.280973)\\n(1711.0, 25.445793)\\n(1721.0, 20.703712)\\n(1731.0, 28.014605)\\n(1741.0, 23.933783)\\n(1751.0, 38.952827)\\n(1761.0, 27.075724)\\n(1771.0, 22.18035)\\n(1781.0, 25.067553)\\n(1791.0, 20.568766)\\n(1801.0, 27.758867)\\n(1811.0, 23.580475)\\n(1821.0, 38.63329)\\n(1831.0, 26.998041)\\n(1841.0, 22.081026)\\n(1851.0, 24.726389)\\n(1861.0, 20.434946)\\n(1871.0, 27.525543)\\n(1881.0, 23.241693)\\n(1891.0, 38.332787)\\n(1901.0, 26.89969)\\n(1911.0, 21.981127)\\n(1921.0, 24.415022)\\n(1931.0, 20.30184)\\n(1941.0, 27.309921)\\n(1951.0, 22.91609)\\n(1961.0, 38.047565)\\n(1971.0, 26.786087)\\n(1981.0, 21.879519)\\n(1991.0, 24.127806)\\n(2001.0, 20.16992)\\n(2011.0, 27.108385)\\n(2021.0, 22.602415)\\n(2031.0, 37.774715)\\n(2041.0, 26.6614)\\n(2051.0, 21.775583)\\n(2061.0, 23.860296)\\n(2071.0, 20.039495)\\n(2081.0, 26.918213)\\n(2091.0, 22.299591)\\n(2101.0, 37.51195)\\n(2111.0, 26.528807)\\n(2121.0, 21.669043)\\n(2131.0, 23.60904)\\n(2141.0, 19.909348)\\n(2151.0, 26.737251)\\n(2161.0, 22.00657)\\n(2171.0, 37.25749)\\n(2181.0, 26.390741)\\n(2191.0, 21.557068)\\n(2201.0, 23.390928)\\n(2211.0, 19.791052)\\n(2221.0, 26.557589)\\n(2231.0, 21.720617)\\n(2241.0, 37.003036)\\n(2251.0, 26.246796)\\n(2261.0, 21.448954)\\n(2271.0, 23.148926)\\n(2281.0, 19.63516)\\n(2291.0, 26.388718)\\n(2301.0, 21.446524)\\n(2311.0, 36.75658)\\n(2321.0, 26.103775)\\n(2331.0, 21.334938)\\n(2341.0, 22.935982)\\n(2351.0, 19.523363)\\n(2361.0, 26.230698)\\n(2371.0, 21.178694)\\n(2381.0, 36.518883)\\n(2391.0, 25.955566)\\n(2401.0, 21.218773)\\n(2411.0, 22.73044)\\n(2421.0, 19.394484)\\n(2431.0, 26.073755)\\n(2441.0, 20.917225)\\n(2451.0, 36.286194)\\n(2461.0, 25.808731)\\n(2471.0, 21.100763)\\n(2481.0, 22.530174)\\n(2491.0, 19.266949)\\n(2501.0, 25.920866)\\n(2511.0, 20.662354)\\n(2521.0, 36.057888)\\n(2531.0, 25.662592)\\n(2541.0, 20.980875)\\n(2551.0, 22.335869)\\n(2561.0, 19.121769)\\n(2571.0, 25.767944)\\n(2581.0, 20.412678)\\n(2591.0, 35.83189)\\n(2601.0, 25.518303)\\n(2611.0, 20.861252)\\n(2621.0, 22.170975)\\n(2631.0, 19.01291)\\n(2641.0, 25.621908)\\n(2651.0, 20.16757)\\n(2661.0, 35.611748)\\n(2671.0, 25.384817)\\n(2681.0, 20.735481)\\n(2691.0, 21.97281)\\n(2701.0, 18.889093)\\n(2711.0, 25.48339)\\n(2721.0, 19.929289)\\n(2731.0, 35.38044)\\n(2741.0, 25.236109)\\n(2751.0, 20.609682)\\n(2761.0, 21.79069)\\n(2771.0, 18.76502)\\n(2781.0, 25.337973)\\n(2791.0, 19.697853)\\n(2801.0, 35.17404)\\n(2811.0, 25.089685)\\n(2821.0, 20.490276)\\n(2831.0, 21.635172)\\n(2841.0, 18.640432)\\n(2851.0, 25.19678)\\n(2861.0, 19.466917)\\n(2871.0, 34.93639)\\n(2881.0, 24.9617)\\n(2891.0, 20.35883)\\n(2901.0, 21.442163)\\n(2911.0, 18.521025)\\n(2921.0, 25.060448)\\n(2931.0, 19.243958)\\n(2941.0, 34.746765)\\n(2951.0, 24.813917)\\n(2961.0, 20.236435)\\n(2971.0, 21.287125)\\n(2981.0, 18.405294)\\n(2991.0, 24.926834)\\n(3001.0, 19.016335)\\n(3011.0, 
34.51958)\\n(3021.0, 24.678228)\\n(3031.0, 20.108534)\\n(3041.0, 21.132662)\\n(3051.0, 18.277847)\\n(3061.0, 24.783506)\\n(3071.0, 18.808496)\\n(3081.0, 34.312763)\\n(3091.0, 24.586138)\\n(3101.0, 19.981928)\\n(3111.0, 21.072523)\\n(3121.0, 18.1998)\\n(3131.0, 24.462622)\\n(3141.0, 18.570927)\\n(3151.0, 33.977917)\\n(3161.0, 24.454895)\\n(3171.0, 19.873302)\\n(3181.0, 20.975105)\\n(3191.0, 18.081253)\\n(3201.0, 24.3471)\\n(3211.0, 18.361168)\\n(3221.0, 33.76693)\\n(3231.0, 24.294561)\\n(3241.0, 19.75377)\\n(3251.0, 20.831343)\\n(3261.0, 17.963343)\\n(3271.0, 24.236174)\\n(3281.0, 18.150732)\\n(3291.0, 33.56369)\\n(3301.0, 24.13978)\\n(3311.0, 19.633541)\\n(3321.0, 20.680338)\\n(3331.0, 17.842897)\\n(3341.0, 24.109001)\\n(3351.0, 17.94522)\\n(3361.0, 33.344692)\\n(3371.0, 24.010622)\\n(3381.0, 19.509325)\\n(3391.0, 20.535295)\\n(3401.0, 17.722319)\\n(3411.0, 23.978928)\\n(3421.0, 17.743357)\\n(3431.0, 33.1523)\\n(3441.0, 23.87459)\\n(3451.0, 19.392199)\\n(3461.0, 20.391563)\\n(3471.0, 17.60779)\\n(3481.0, 23.84655)\\n(3491.0, 17.553146)\\n};\\n\\\\addlegendentry{Loss function}\\n\\\\addplot+ [mark = {none}]coordinates {\\n(1.0, 64.51263)\\n(11.0, 64.021416)\\n(21.0, 63.844357)\\n(31.0, 63.253635)\\n(41.0, 63.03423)\\n(51.0, 62.616932)\\n(61.0, 62.59424)\\n(71.0, 61.926487)\\n(81.0, 61.495876)\\n(91.0, 61.32961)\\n(101.0, 60.81648)\\n(111.0, 60.61458)\\n(121.0, 60.24987)\\n(131.0, 60.225445)\\n(141.0, 59.65531)\\n(151.0, 59.2688)\\n(161.0, 59.10603)\\n(171.0, 58.653248)\\n(181.0, 58.462162)\\n(191.0, 58.13645)\\n(201.0, 58.103317)\\n(211.0, 57.612183)\\n(221.0, 57.259544)\\n(231.0, 57.099327)\\n(241.0, 56.69501)\\n(251.0, 56.51202)\\n(261.0, 56.218834)\\n(271.0, 56.18083)\\n(281.0, 55.756695)\\n(291.0, 55.43328)\\n(301.0, 55.27245)\\n(311.0, 54.908638)\\n(321.0, 54.73314)\\n(331.0, 54.467487)\\n(341.0, 54.42596)\\n(351.0, 54.059364)\\n(361.0, 53.761684)\\n(371.0, 53.60122)\\n(381.0, 53.27309)\\n(391.0, 53.105072)\\n(401.0, 52.863476)\\n(411.0, 52.820038)\\n(421.0, 52.50306)\\n(431.0, 52.228878)\\n(441.0, 52.070198)\\n(451.0, 51.77324)\\n(461.0, 51.61325)\\n(471.0, 51.392826)\\n(481.0, 51.349487)\\n(491.0, 51.07533)\\n(501.0, 50.823048)\\n(511.0, 48.061905)\\n(521.0, 47.10409)\\n(531.0, 46.107243)\\n(541.0, 45.26603)\\n(551.0, 44.812416)\\n(561.0, 44.072983)\\n(571.0, 43.523937)\\n(581.0, 42.899113)\\n(591.0, 42.764706)\\n(601.0, 42.485744)\\n(611.0, 42.03601)\\n(621.0, 41.9447)\\n(631.0, 41.48105)\\n(641.0, 41.1306)\\n(651.0, 40.66566)\\n(661.0, 40.533386)\\n(671.0, 40.3107)\\n(681.0, 39.892464)\\n(691.0, 39.85538)\\n(701.0, 39.44688)\\n(711.0, 39.161118)\\n(721.0, 38.717052)\\n(731.0, 38.63174)\\n(741.0, 38.447742)\\n(751.0, 38.05747)\\n(761.0, 38.06545)\\n(771.0, 37.702793)\\n(781.0, 37.452106)\\n(791.0, 37.02748)\\n(801.0, 36.9566)\\n(811.0, 36.799004)\\n(821.0, 36.43456)\\n(831.0, 36.46789)\\n(841.0, 36.13865)\\n(851.0, 35.9076)\\n(861.0, 35.502728)\\n(871.0, 35.442894)\\n(881.0, 35.304325)\\n(891.0, 34.962505)\\n(901.0, 35.011707)\\n(911.0, 34.713326)\\n(921.0, 34.503185)\\n(931.0, 34.11671)\\n(941.0, 34.06925)\\n(951.0, 33.9501)\\n(961.0, 33.63209)\\n(971.0, 33.697105)\\n(981.0, 33.428978)\\n(991.0, 33.239216)\\n(1001.0, 32.87231)\\n(1011.0, 32.83538)\\n(1021.0, 32.736553)\\n(1031.0, 32.44408)\\n(1041.0, 32.52465)\\n(1051.0, 32.286236)\\n(1061.0, 32.11628)\\n(1071.0, 31.77004)\\n(1081.0, 31.741486)\\n(1091.0, 31.663265)\\n(1101.0, 31.397623)\\n(1111.0, 31.493086)\\n(1121.0, 31.283592)\\n(1131.0, 31.13258)\\n(1141.0, 30.80767)\\n(1151.0, 30.784637)\\n(1161.0, 30.726746)\\n(1171.0, 
30.488527)\\n(1181.0, 30.59724)\\n(1191.0, 30.41563)\\n(1201.0, 30.28314)\\n(1211.0, 29.982765)\\n(1221.0, 29.96259)\\n(1231.0, 29.924417)\\n(1241.0, 29.711843)\\n(1251.0, 29.832413)\\n(1261.0, 29.676847)\\n(1271.0, 29.56045)\\n(1281.0, 29.281374)\\n(1291.0, 29.260527)\\n(1301.0, 29.239487)\\n(1311.0, 29.053099)\\n(1321.0, 29.183727)\\n(1331.0, 29.051607)\\n(1341.0, 28.949255)\\n(1351.0, 28.689735)\\n(1361.0, 28.665344)\\n(1371.0, 28.65905)\\n(1381.0, 28.496685)\\n(1391.0, 28.634657)\\n(1401.0, 28.523249)\\n(1411.0, 28.43277)\\n(1421.0, 28.190666)\\n(1431.0, 28.160574)\\n(1441.0, 28.166313)\\n(1451.0, 28.02534)\\n(1461.0, 28.168764)\\n(1471.0, 28.075018)\\n(1481.0, 27.994509)\\n(1491.0, 27.767038)\\n(1501.0, 27.729778)\\n(1511.0, 27.744757)\\n(1521.0, 27.622091)\\n(1531.0, 27.769133)\\n(1541.0, 27.690104)\\n(1551.0, 27.617723)\\n(1561.0, 27.402596)\\n(1571.0, 27.357645)\\n(1581.0, 27.379333)\\n(1591.0, 27.272009)\\n(1601.0, 27.421051)\\n(1611.0, 27.353989)\\n(1621.0, 27.288305)\\n(1631.0, 27.08287)\\n(1641.0, 27.030434)\\n(1651.0, 27.056757)\\n(1661.0, 26.962122)\\n(1671.0, 27.11216)\\n(1681.0, 27.054688)\\n(1691.0, 26.994583)\\n(1701.0, 26.797552)\\n(1711.0, 26.738216)\\n(1721.0, 26.767576)\\n(1731.0, 26.682781)\\n(1741.0, 26.832815)\\n(1751.0, 26.782974)\\n(1761.0, 26.72782)\\n(1771.0, 26.537167)\\n(1781.0, 26.471706)\\n(1791.0, 26.502905)\\n(1801.0, 26.425879)\\n(1811.0, 26.575584)\\n(1821.0, 26.531744)\\n(1831.0, 26.480806)\\n(1841.0, 26.295233)\\n(1851.0, 26.225195)\\n(1861.0, 26.25737)\\n(1871.0, 26.186632)\\n(1881.0, 26.335855)\\n(1891.0, 26.296564)\\n(1901.0, 26.249361)\\n(1911.0, 26.06778)\\n(1921.0, 25.99397)\\n(1931.0, 26.026533)\\n(1941.0, 25.960775)\\n(1951.0, 26.109358)\\n(1961.0, 26.073637)\\n(1971.0, 26.0298)\\n(1981.0, 25.851439)\\n(1991.0, 25.774733)\\n(2001.0, 25.80732)\\n(2011.0, 25.74546)\\n(2021.0, 25.893309)\\n(2031.0, 25.860386)\\n(2041.0, 25.819624)\\n(2051.0, 25.643913)\\n(2061.0, 25.564857)\\n(2071.0, 25.597212)\\n(2081.0, 25.538391)\\n(2091.0, 25.685438)\\n(2101.0, 25.654728)\\n(2111.0, 25.616816)\\n(2121.0, 25.443558)\\n(2131.0, 25.363094)\\n(2141.0, 25.395134)\\n(2151.0, 25.338669)\\n(2161.0, 25.484919)\\n(2171.0, 25.45607)\\n(2181.0, 25.420746)\\n(2191.0, 25.249226)\\n(2201.0, 25.167677)\\n(2211.0, 25.199383)\\n(2221.0, 25.14476)\\n(2231.0, 25.290075)\\n(2241.0, 25.262562)\\n(2251.0, 25.229652)\\n(2261.0, 25.059732)\\n(2271.0, 24.977448)\\n(2281.0, 25.00871)\\n(2291.0, 24.955547)\\n(2301.0, 25.10015)\\n(2311.0, 25.073723)\\n(2321.0, 25.043129)\\n(2331.0, 24.87483)\\n(2341.0, 24.792082)\\n(2351.0, 24.822931)\\n(2361.0, 24.770853)\\n(2371.0, 24.914434)\\n(2381.0, 24.88905)\\n(2391.0, 24.860506)\\n(2401.0, 24.693275)\\n(2411.0, 24.610315)\\n(2421.0, 24.640806)\\n(2431.0, 24.589607)\\n(2441.0, 24.732628)\\n(2451.0, 24.707914)\\n(2461.0, 24.681381)\\n(2471.0, 24.515436)\\n(2481.0, 24.432457)\\n(2491.0, 24.46263)\\n(2501.0, 24.412022)\\n(2511.0, 24.554096)\\n(2521.0, 24.530037)\\n(2531.0, 24.505344)\\n(2541.0, 24.34057)\\n(2551.0, 24.257566)\\n(2561.0, 24.287346)\\n(2571.0, 24.237331)\\n(2581.0, 24.37865)\\n(2591.0, 24.35556)\\n(2601.0, 24.332611)\\n(2611.0, 24.169847)\\n(2621.0, 24.087381)\\n(2631.0, 24.115038)\\n(2641.0, 24.065157)\\n(2651.0, 24.204453)\\n(2661.0, 24.181854)\\n(2671.0, 24.160732)\\n(2681.0, 23.99952)\\n(2691.0, 23.917248)\\n(2701.0, 23.944872)\\n(2711.0, 23.895073)\\n(2721.0, 24.033445)\\n(2731.0, 24.01104)\\n(2741.0, 23.991564)\\n(2751.0, 23.83145)\\n(2761.0, 23.749434)\\n(2771.0, 23.777033)\\n(2781.0, 23.727545)\\n(2791.0, 23.86545)\\n(2801.0, 
23.843187)\\n(2811.0, 23.825233)\\n(2821.0, 23.666063)\\n(2831.0, 23.584274)\\n(2841.0, 23.611738)\\n(2851.0, 23.562326)\\n(2861.0, 23.699167)\\n(2871.0, 23.67719)\\n(2881.0, 23.660662)\\n(2891.0, 23.50241)\\n(2901.0, 23.420893)\\n(2911.0, 23.448221)\\n(2921.0, 23.398844)\\n(2931.0, 23.535059)\\n(2941.0, 23.513283)\\n(2951.0, 23.498182)\\n(2961.0, 23.34081)\\n(2971.0, 23.259615)\\n(2981.0, 23.286802)\\n(2991.0, 23.237524)\\n(3001.0, 23.27723)\\n(3011.0, 23.250525)\\n(3021.0, 23.276936)\\n(3031.0, 23.145172)\\n(3041.0, 23.11895)\\n(3051.0, 23.141455)\\n(3061.0, 23.15003)\\n(3071.0, 23.19334)\\n(3081.0, 23.165356)\\n(3091.0, 23.195608)\\n(3101.0, 23.055202)\\n(3111.0, 23.027527)\\n(3121.0, 23.052143)\\n(3131.0, 23.062271)\\n(3141.0, 23.109356)\\n(3151.0, 23.079798)\\n(3161.0, 23.114468)\\n(3171.0, 22.963896)\\n(3181.0, 22.934374)\\n(3191.0, 22.961523)\\n(3201.0, 22.97388)\\n(3211.0, 23.025585)\\n(3221.0, 22.994076)\\n(3231.0, 23.033878)\\n(3241.0, 22.871424)\\n(3251.0, 22.839893)\\n(3261.0, 22.870033)\\n(3271.0, 22.884819)\\n(3281.0, 22.94204)\\n(3291.0, 22.908293)\\n(3301.0, 22.954155)\\n(3311.0, 22.777056)\\n(3321.0, 22.743046)\\n(3331.0, 22.776669)\\n(3341.0, 22.793964)\\n(3351.0, 22.857866)\\n(3361.0, 22.821321)\\n(3371.0, 22.874563)\\n(3381.0, 22.680017)\\n(3391.0, 22.642612)\\n(3401.0, 22.680676)\\n(3411.0, 22.701666)\\n(3421.0, 22.773748)\\n(3431.0, 22.733631)\\n(3441.0, 22.795948)\\n(3451.0, 22.578657)\\n(3461.0, 22.537266)\\n(3471.0, 22.580675)\\n(3481.0, 22.605553)\\n(3491.0, 22.68791)\\n};\\n\\\\addlegendentry{Loss function (smoothed)}\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot learning curve: loss vs. 
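The `window = 50` and `sampleevery = 10` keywords produce the smoothed legend entry in the plot above. A minimal sketch of one plausible reading of those keywords, a trailing 50-iteration moving average evaluated at every 10th iteration; this is an illustration under that assumption, not PredictMD's actual implementation:

```julia
# Illustration only: trailing moving average over the per-iteration losses,
# sampled every `sampleevery` iterations.
function smoothed_losses(losses::AbstractVector, window::Int, sampleevery::Int)
    [mean(losses[i - window + 1:i]) for i in window:sampleevery:length(losses)]
end

iters, iterlosses = get(knetmlpreg.history, :lossatiteration)
smoothed = smoothed_losses(iterlosses, 50, 10)
```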
iteration\n", - "knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(\n", - " knetmlpreg,\n", - " :lossvsiteration;\n", - " window = 50,\n", - " sampleevery = 10,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[legend pos = {north east}, ylabel = {Loss}, xlabel = {Iteration}]\\\\addplot+ [mark = {none}]coordinates {\\n(100.0, 74.64759)\\n(110.0, 62.014874)\\n(120.0, 80.57115)\\n(130.0, 36.871254)\\n(140.0, 56.221115)\\n(150.0, 70.7974)\\n(160.0, 48.833294)\\n(170.0, 68.47111)\\n(180.0, 54.3818)\\n(190.0, 75.99148)\\n(200.0, 31.699387)\\n(210.0, 51.330685)\\n(220.0, 67.38707)\\n(230.0, 42.954563)\\n(240.0, 63.89613)\\n(250.0, 49.38704)\\n(260.0, 73.2438)\\n(270.0, 29.077698)\\n(280.0, 47.258987)\\n(290.0, 64.10876)\\n(300.0, 39.228275)\\n(310.0, 60.728207)\\n(320.0, 47.002914)\\n(330.0, 70.0215)\\n(340.0, 27.924417)\\n(350.0, 44.659042)\\n(360.0, 61.330242)\\n(370.0, 36.753323)\\n(380.0, 57.846058)\\n(390.0, 44.687885)\\n(400.0, 67.37218)\\n(410.0, 26.488558)\\n(420.0, 42.195225)\\n(430.0, 58.517426)\\n(440.0, 34.61604)\\n(450.0, 54.96446)\\n(460.0, 42.453663)\\n(470.0, 64.75191)\\n(480.0, 25.204676)\\n(490.0, 39.7143)\\n(500.0, 55.550274)\\n(510.0, 32.543377)\\n(520.0, 52.058475)\\n(530.0, 40.294415)\\n(540.0, 62.11764)\\n(550.0, 24.134218)\\n(560.0, 37.24825)\\n(570.0, 52.4635)\\n(580.0, 30.596586)\\n(590.0, 49.160397)\\n(600.0, 38.233067)\\n(610.0, 59.468792)\\n(620.0, 
23.349474)\\n(630.0, 34.847576)\\n(640.0, 49.310883)\\n(650.0, 28.803604)\\n(660.0, 46.3201)\\n(670.0, 36.25176)\\n(680.0, 56.610474)\\n(690.0, 22.987656)\\n(700.0, 32.49642)\\n(710.0, 46.31162)\\n(720.0, 27.215189)\\n(730.0, 43.568207)\\n(740.0, 34.440205)\\n(750.0, 54.00268)\\n(760.0, 22.671587)\\n(770.0, 30.452408)\\n(780.0, 43.286255)\\n(790.0, 25.810448)\\n(800.0, 41.04581)\\n(810.0, 32.856106)\\n(820.0, 51.57895)\\n(830.0, 22.798256)\\n(840.0, 28.647558)\\n(850.0, 40.45229)\\n(860.0, 24.688461)\\n(870.0, 38.778996)\\n(880.0, 31.48128)\\n(890.0, 49.467117)\\n(900.0, 23.181406)\\n(910.0, 27.126207)\\n(920.0, 37.872524)\\n(930.0, 23.789436)\\n(940.0, 36.7916)\\n(950.0, 30.326296)\\n(960.0, 47.593216)\\n(970.0, 23.749125)\\n(980.0, 25.911066)\\n(990.0, 35.610363)\\n(1000.0, 23.104353)\\n(1010.0, 35.09602)\\n(1020.0, 29.353453)\\n(1030.0, 46.00664)\\n(1040.0, 24.390158)\\n(1050.0, 24.962036)\\n(1060.0, 33.669777)\\n(1070.0, 22.588934)\\n(1080.0, 33.67607)\\n(1090.0, 28.525587)\\n(1100.0, 44.6998)\\n(1110.0, 25.022774)\\n(1120.0, 24.255417)\\n(1130.0, 31.992563)\\n(1140.0, 22.199411)\\n(1150.0, 32.50806)\\n(1160.0, 27.81574)\\n(1170.0, 43.60433)\\n(1180.0, 25.62074)\\n(1190.0, 23.722654)\\n(1200.0, 30.647833)\\n(1210.0, 21.919737)\\n(1220.0, 31.534004)\\n(1230.0, 27.19295)\\n(1240.0, 42.664417)\\n(1250.0, 26.122835)\\n(1260.0, 23.332184)\\n(1270.0, 29.526)\\n(1280.0, 21.696074)\\n(1290.0, 30.738785)\\n(1300.0, 26.63605)\\n(1310.0, 41.88197)\\n(1320.0, 26.503527)\\n(1330.0, 23.050203)\\n(1340.0, 28.578735)\\n(1350.0, 21.501572)\\n(1360.0, 30.094397)\\n(1370.0, 26.129814)\\n(1380.0, 41.243317)\\n(1390.0, 26.78305)\\n(1400.0, 22.838799)\\n(1410.0, 27.774973)\\n(1420.0, 21.327805)\\n(1430.0, 29.559847)\\n(1440.0, 25.662363)\\n(1450.0, 40.69521)\\n(1460.0, 26.972769)\\n(1470.0, 22.672852)\\n(1480.0, 27.093956)\\n(1490.0, 21.17037)\\n(1500.0, 29.10955)\\n(1510.0, 25.224304)\\n(1520.0, 40.21569)\\n(1530.0, 27.085585)\\n(1540.0, 22.536634)\\n(1550.0, 26.511887)\\n(1560.0, 21.023117)\\n(1570.0, 28.724802)\\n(1580.0, 24.81041)\\n(1590.0, 39.78951)\\n(1600.0, 27.135118)\\n(1610.0, 22.41887)\\n(1620.0, 26.008593)\\n(1630.0, 20.882008)\\n(1640.0, 28.39093)\\n(1650.0, 24.41738)\\n(1660.0, 39.40499)\\n(1670.0, 27.134192)\\n(1680.0, 22.311834)\\n(1690.0, 25.567825)\\n(1700.0, 20.744503)\\n(1710.0, 28.096575)\\n(1720.0, 24.04283)\\n(1730.0, 39.053104)\\n(1740.0, 27.093996)\\n(1750.0, 22.210289)\\n(1760.0, 25.176752)\\n(1770.0, 20.60911)\\n(1780.0, 27.83299)\\n(1790.0, 23.684866)\\n(1800.0, 38.726933)\\n(1810.0, 27.023796)\\n(1820.0, 22.11081)\\n(1830.0, 24.825317)\\n(1840.0, 20.475)\\n(1850.0, 27.593485)\\n(1860.0, 23.341883)\\n(1870.0, 38.421173)\\n(1880.0, 26.931015)\\n(1890.0, 22.011238)\\n(1900.0, 24.505663)\\n(1910.0, 20.34171)\\n(1920.0, 27.372978)\\n(1930.0, 23.012472)\\n(1940.0, 38.13172)\\n(1950.0, 26.821508)\\n(1960.0, 21.910227)\\n(1970.0, 24.211708)\\n(1980.0, 20.209085)\\n(1990.0, 27.167543)\\n(2000.0, 22.695335)\\n(2010.0, 37.855427)\\n(2020.0, 26.699745)\\n(2030.0, 21.807032)\\n(2040.0, 23.938704)\\n(2050.0, 20.078592)\\n(2060.0, 26.974207)\\n(2070.0, 22.38935)\\n(2080.0, 37.589825)\\n(2090.0, 26.569254)\\n(2100.0, 21.70128)\\n(2110.0, 23.682882)\\n(2120.0, 19.948343)\\n(2130.0, 26.79067)\\n(2140.0, 22.093498)\\n(2150.0, 37.33304)\\n(2160.0, 26.432623)\\n(2170.0, 21.592865)\\n(2180.0, 23.441362)\\n(2190.0, 19.804766)\\n(2200.0, 26.60608)\\n(2210.0, 21.80691)\\n(2220.0, 37.072933)\\n(2230.0, 26.293142)\\n(2240.0, 21.482283)\\n(2250.0, 23.218016)\\n(2260.0, 19.688627)\\n(2270.0, 
26.447126)\\n(2280.0, 21.52316)\\n(2290.0, 36.81198)\\n(2300.0, 26.146101)\\n(2310.0, 21.366693)\\n(2320.0, 23.00482)\\n(2330.0, 19.557543)\\n(2340.0, 26.28353)\\n(2350.0, 21.25719)\\n(2360.0, 36.58118)\\n(2370.0, 26.006144)\\n(2380.0, 21.250605)\\n(2390.0, 22.796995)\\n(2400.0, 19.428745)\\n(2410.0, 26.125124)\\n(2420.0, 20.993807)\\n(2430.0, 36.34689)\\n(2440.0, 25.859072)\\n(2450.0, 21.133211)\\n(2460.0, 22.59498)\\n(2470.0, 19.301008)\\n(2480.0, 25.971193)\\n(2490.0, 20.73709)\\n(2500.0, 36.11731)\\n(2510.0, 25.712563)\\n(2520.0, 21.013813)\\n(2530.0, 22.399)\\n(2540.0, 19.174227)\\n(2550.0, 25.820646)\\n(2560.0, 20.48142)\\n(2570.0, 35.883976)\\n(2580.0, 25.56035)\\n(2590.0, 20.893938)\\n(2600.0, 22.207321)\\n(2610.0, 19.050661)\\n(2620.0, 25.66764)\\n(2630.0, 20.242079)\\n(2640.0, 35.6764)\\n(2650.0, 25.416191)\\n(2660.0, 20.773785)\\n(2670.0, 22.04154)\\n(2680.0, 18.92655)\\n(2690.0, 25.519272)\\n(2700.0, 20.00255)\\n(2710.0, 35.45815)\\n(2720.0, 25.272903)\\n(2730.0, 20.649109)\\n(2740.0, 21.852104)\\n(2750.0, 18.807316)\\n(2760.0, 25.381655)\\n(2770.0, 19.764973)\\n(2780.0, 35.227154)\\n(2790.0, 25.137606)\\n(2800.0, 20.522429)\\n(2810.0, 21.66832)\\n(2820.0, 18.679058)\\n(2830.0, 25.240189)\\n(2840.0, 19.537216)\\n(2850.0, 35.02379)\\n(2860.0, 24.992212)\\n(2870.0, 20.402536)\\n(2880.0, 21.51467)\\n(2890.0, 18.554873)\\n(2900.0, 25.100348)\\n(2910.0, 19.309484)\\n(2920.0, 34.794994)\\n(2930.0, 24.860949)\\n(2940.0, 20.271204)\\n(2950.0, 21.327002)\\n(2960.0, 18.416323)\\n(2970.0, 24.962553)\\n(2980.0, 19.089767)\\n(2990.0, 34.597446)\\n(3000.0, 24.732866)\\n(3010.0, 20.147593)\\n(3020.0, 21.16484)\\n(3030.0, 18.318384)\\n(3040.0, 24.83132)\\n(3050.0, 18.864428)\\n(3060.0, 34.384727)\\n(3070.0, 24.58586)\\n(3080.0, 20.02096)\\n(3090.0, 21.062624)\\n(3100.0, 18.20466)\\n(3110.0, 24.565561)\\n(3120.0, 18.635656)\\n(3130.0, 34.02261)\\n(3140.0, 24.52159)\\n(3150.0, 19.899551)\\n(3160.0, 21.019434)\\n(3170.0, 18.120987)\\n(3180.0, 24.397814)\\n(3190.0, 18.412302)\\n(3200.0, 33.81991)\\n(3210.0, 24.32847)\\n(3220.0, 19.78956)\\n(3230.0, 20.88538)\\n(3240.0, 18.005459)\\n(3250.0, 24.27019)\\n(3260.0, 18.21184)\\n(3270.0, 33.620678)\\n(3280.0, 24.18815)\\n(3290.0, 19.669474)\\n(3300.0, 20.723047)\\n(3310.0, 17.881319)\\n(3320.0, 24.144835)\\n(3330.0, 18.0053)\\n(3340.0, 33.412838)\\n(3350.0, 24.04351)\\n(3360.0, 19.549166)\\n(3370.0, 20.583315)\\n(3380.0, 17.760424)\\n(3390.0, 24.013716)\\n(3400.0, 17.8089)\\n(3410.0, 33.221302)\\n(3420.0, 23.918213)\\n(3430.0, 19.428072)\\n(3440.0, 20.43717)\\n(3450.0, 17.644094)\\n(3460.0, 23.893988)\\n(3470.0, 17.607344)\\n(3480.0, 33.027195)\\n(3490.0, 23.787)\\n(3500.0, 19.306576)\\n};\\n\\\\addlegendentry{Loss function}\\n\\\\addplot+ [mark = {none}]coordinates {\\n(100.0, 50.392265)\\n(110.0, 50.566814)\\n(120.0, 50.053276)\\n(130.0, 49.771694)\\n(140.0, 49.763313)\\n(150.0, 49.389034)\\n(160.0, 49.335194)\\n(170.0, 49.109615)\\n(180.0, 49.23675)\\n(190.0, 48.799263)\\n(200.0, 48.532)\\n(210.0, 48.49619)\\n(220.0, 48.158394)\\n(230.0, 48.08667)\\n(240.0, 47.876724)\\n(250.0, 47.969547)\\n(260.0, 47.59197)\\n(270.0, 47.339916)\\n(280.0, 47.281166)\\n(290.0, 46.97444)\\n(300.0, 46.89094)\\n(310.0, 46.696014)\\n(320.0, 46.7629)\\n(330.0, 46.439056)\\n(340.0, 46.201836)\\n(350.0, 46.126186)\\n(360.0, 45.847775)\\n(370.0, 45.75715)\\n(380.0, 45.57644)\\n(390.0, 45.625072)\\n(400.0, 45.347992)\\n(410.0, 45.125774)\\n(420.0, 45.038387)\\n(430.0, 44.785423)\\n(440.0, 44.691376)\\n(450.0, 44.52434)\\n(460.0, 44.55962)\\n(470.0, 44.323135)\\n(480.0, 
44.116257)\\n(490.0, 44.021744)\\n(500.0, 43.79188)\\n(510.0, 43.69736)\\n(520.0, 43.543125)\\n(530.0, 43.569332)\\n(540.0, 43.36745)\\n(550.0, 43.175724)\\n(560.0, 43.07773)\\n(570.0, 42.868656)\\n(580.0, 42.775806)\\n(590.0, 42.6333)\\n(600.0, 42.65376)\\n(610.0, 42.162422)\\n(620.0, 41.788574)\\n(630.0, 41.307594)\\n(640.0, 41.162327)\\n(650.0, 40.927547)\\n(660.0, 40.501984)\\n(670.0, 40.45021)\\n(680.0, 40.025955)\\n(690.0, 39.722397)\\n(700.0, 39.27345)\\n(710.0, 39.17662)\\n(720.0, 38.980614)\\n(730.0, 38.58265)\\n(740.0, 38.579777)\\n(750.0, 38.205784)\\n(760.0, 37.947815)\\n(770.0, 37.51497)\\n(780.0, 37.44188)\\n(790.0, 37.278313)\\n(800.0, 36.9073)\\n(810.0, 36.933575)\\n(820.0, 36.594715)\\n(830.0, 36.35756)\\n(840.0, 35.94723)\\n(850.0, 35.883644)\\n(860.0, 35.739433)\\n(870.0, 35.39092)\\n(880.0, 35.435375)\\n(890.0, 35.12782)\\n(900.0, 34.91149)\\n(910.0, 34.51944)\\n(920.0, 34.468346)\\n(930.0, 34.343243)\\n(940.0, 34.017944)\\n(950.0, 34.078136)\\n(960.0, 33.800987)\\n(970.0, 33.605137)\\n(980.0, 33.23229)\\n(990.0, 33.192345)\\n(1000.0, 33.08735)\\n(1010.0, 32.78709)\\n(1020.0, 32.863052)\\n(1030.0, 32.6158)\\n(1040.0, 32.43998)\\n(1050.0, 32.08745)\\n(1060.0, 32.056644)\\n(1070.0, 31.972252)\\n(1080.0, 31.69846)\\n(1090.0, 31.789478)\\n(1100.0, 31.571405)\\n(1110.0, 31.414831)\\n(1120.0, 31.083542)\\n(1130.0, 31.059113)\\n(1140.0, 30.995182)\\n(1150.0, 30.748716)\\n(1160.0, 30.85368)\\n(1170.0, 30.663721)\\n(1180.0, 30.5257)\\n(1190.0, 30.218346)\\n(1200.0, 30.196135)\\n(1210.0, 30.152576)\\n(1220.0, 29.93209)\\n(1230.0, 30.0493)\\n(1240.0, 29.886187)\\n(1250.0, 29.765099)\\n(1260.0, 29.479692)\\n(1270.0, 29.459272)\\n(1280.0, 29.433338)\\n(1290.0, 29.239267)\\n(1300.0, 29.367151)\\n(1310.0, 29.228317)\\n(1320.0, 29.12193)\\n(1330.0, 28.85704)\\n(1340.0, 28.834042)\\n(1350.0, 28.823603)\\n(1360.0, 28.654194)\\n(1370.0, 28.79016)\\n(1380.0, 28.672852)\\n(1390.0, 28.57909)\\n(1400.0, 28.331947)\\n(1410.0, 28.303831)\\n(1420.0, 28.306276)\\n(1430.0, 28.15915)\\n(1440.0, 28.301151)\\n(1450.0, 28.202436)\\n(1460.0, 28.11911)\\n(1470.0, 27.887608)\\n(1480.0, 27.852556)\\n(1490.0, 27.864996)\\n(1500.0, 27.737125)\\n(1510.0, 27.883175)\\n(1520.0, 27.800043)\\n(1530.0, 27.72533)\\n(1540.0, 27.506834)\\n(1550.0, 27.464146)\\n(1560.0, 27.484066)\\n(1570.0, 27.372377)\\n(1580.0, 27.520905)\\n(1590.0, 27.45054)\\n(1600.0, 27.382969)\\n(1610.0, 27.174883)\\n(1620.0, 27.124638)\\n(1630.0, 27.149742)\\n(1640.0, 27.051733)\\n(1650.0, 27.20157)\\n(1660.0, 27.141415)\\n(1670.0, 27.079803)\\n(1680.0, 26.88017)\\n(1690.0, 26.822584)\\n(1700.0, 26.851131)\\n(1710.0, 26.763601)\\n(1720.0, 26.913631)\\n(1730.0, 26.861746)\\n(1740.0, 26.805202)\\n(1750.0, 26.612665)\\n(1760.0, 26.548958)\\n(1770.0, 26.5798)\\n(1780.0, 26.500563)\\n(1790.0, 26.650227)\\n(1800.0, 26.604755)\\n(1810.0, 26.552582)\\n(1820.0, 26.365679)\\n(1830.0, 26.29691)\\n(1840.0, 26.328922)\\n(1850.0, 26.25643)\\n(1860.0, 26.405733)\\n(1870.0, 26.365255)\\n(1880.0, 26.316948)\\n(1890.0, 26.134308)\\n(1900.0, 26.061495)\\n(1910.0, 26.094032)\\n(1920.0, 26.026894)\\n(1930.0, 26.1756)\\n(1940.0, 26.138958)\\n(1950.0, 26.094112)\\n(1960.0, 25.914904)\\n(1970.0, 25.838945)\\n(1980.0, 25.871601)\\n(1990.0, 25.80866)\\n(2000.0, 25.956654)\\n(2010.0, 25.923018)\\n(2020.0, 25.881332)\\n(2030.0, 25.704927)\\n(2040.0, 25.626598)\\n(2050.0, 25.659113)\\n(2060.0, 25.599403)\\n(2070.0, 25.746544)\\n(2080.0, 25.715212)\\n(2090.0, 25.676434)\\n(2100.0, 25.502352)\\n(2110.0, 25.42231)\\n(2120.0, 25.454473)\\n(2130.0, 25.39738)\\n(2140.0, 
25.543863)\\n(2150.0, 25.51441)\\n(2160.0, 25.478334)\\n(2170.0, 25.306414)\\n(2180.0, 25.225151)\\n(2190.0, 25.256908)\\n(2200.0, 25.201805)\\n(2210.0, 25.347486)\\n(2220.0, 25.319529)\\n(2230.0, 25.28593)\\n(2240.0, 25.115623)\\n(2250.0, 25.033575)\\n(2260.0, 25.064976)\\n(2270.0, 25.011393)\\n(2280.0, 25.156124)\\n(2290.0, 25.129438)\\n(2300.0, 25.098127)\\n(2310.0, 24.92923)\\n(2320.0, 24.846607)\\n(2330.0, 24.877592)\\n(2340.0, 24.825233)\\n(2350.0, 24.969282)\\n(2360.0, 24.943525)\\n(2370.0, 24.914425)\\n(2380.0, 24.747034)\\n(2390.0, 24.664104)\\n(2400.0, 24.694689)\\n(2410.0, 24.64324)\\n(2420.0, 24.786343)\\n(2430.0, 24.761469)\\n(2440.0, 24.734327)\\n(2450.0, 24.567944)\\n(2460.0, 24.484722)\\n(2470.0, 24.514946)\\n(2480.0, 24.464235)\\n(2490.0, 24.606697)\\n(2500.0, 24.58259)\\n(2510.0, 24.557364)\\n(2520.0, 24.392109)\\n(2530.0, 24.309126)\\n(2540.0, 24.339067)\\n(2550.0, 24.28883)\\n(2560.0, 24.430475)\\n(2570.0, 24.406828)\\n(2580.0, 24.383379)\\n(2590.0, 24.219744)\\n(2600.0, 24.136925)\\n(2610.0, 24.165285)\\n(2620.0, 24.115313)\\n(2630.0, 24.25466)\\n(2640.0, 24.232195)\\n(2650.0, 24.210474)\\n(2660.0, 24.048952)\\n(2670.0, 23.96666)\\n(2680.0, 23.994429)\\n(2690.0, 23.944635)\\n(2700.0, 24.083399)\\n(2710.0, 24.060846)\\n(2720.0, 24.040873)\\n(2730.0, 23.880604)\\n(2740.0, 23.798546)\\n(2750.0, 23.82615)\\n(2760.0, 23.776583)\\n(2770.0, 23.914524)\\n(2780.0, 23.89216)\\n(2790.0, 23.873804)\\n(2800.0, 23.71451)\\n(2810.0, 23.632679)\\n(2820.0, 23.660187)\\n(2830.0, 23.610687)\\n(2840.0, 23.747868)\\n(2850.0, 23.725689)\\n(2860.0, 23.708776)\\n(2870.0, 23.55038)\\n(2880.0, 23.468742)\\n(2890.0, 23.4961)\\n(2900.0, 23.446712)\\n(2910.0, 23.583271)\\n(2920.0, 23.561422)\\n(2930.0, 23.545918)\\n(2940.0, 23.3884)\\n(2950.0, 23.307058)\\n(2960.0, 23.334394)\\n(2970.0, 23.285013)\\n(2980.0, 23.420916)\\n(2990.0, 23.39929)\\n(3000.0, 23.385128)\\n(3010.0, 23.257807)\\n(3020.0, 23.23301)\\n(3030.0, 23.255653)\\n(3040.0, 23.26449)\\n(3050.0, 23.307096)\\n(3060.0, 23.280636)\\n(3070.0, 23.310415)\\n(3080.0, 23.175215)\\n(3090.0, 23.14929)\\n(3100.0, 23.174072)\\n(3110.0, 23.184813)\\n(3120.0, 23.231264)\\n(3130.0, 23.203577)\\n(3140.0, 23.23762)\\n(3150.0, 23.092981)\\n(3160.0, 23.065649)\\n(3170.0, 23.092934)\\n(3180.0, 23.1056)\\n(3190.0, 23.156565)\\n(3200.0, 23.127396)\\n(3210.0, 23.166454)\\n(3220.0, 23.010864)\\n(3230.0, 22.981863)\\n(3240.0, 23.012161)\\n(3250.0, 23.027424)\\n(3260.0, 23.08369)\\n(3270.0, 23.052639)\\n(3280.0, 23.097675)\\n(3290.0, 22.929209)\\n(3300.0, 22.898106)\\n(3310.0, 22.932041)\\n(3320.0, 22.950357)\\n(3330.0, 23.01317)\\n(3340.0, 22.979933)\\n(3350.0, 23.032097)\\n(3360.0, 22.847607)\\n(3370.0, 22.814098)\\n(3380.0, 22.852377)\\n(3390.0, 22.873953)\\n(3400.0, 22.944757)\\n(3410.0, 22.90883)\\n(3420.0, 22.969835)\\n(3430.0, 22.765955)\\n(3440.0, 22.729198)\\n(3450.0, 22.773092)\\n(3460.0, 22.799385)\\n(3470.0, 22.880554)\\n(3480.0, 22.841269)\\n(3490.0, 22.913414)\\n(3500.0, 22.684315)\\n};\\n\\\\addlegendentry{Loss function (smoothed)}\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot learning curve: loss vs. 
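The iteration axis and the epoch axis are linked through the minibatch count: the history shown earlier holds 3500 per-iteration losses across 500 epochs, i.e. 7 minibatches per epoch, which with `knetmlp_minibatchsize = 48` implies a training set of at most 7 x 48 = 336 rows (and more than 288, since the last minibatch may be partial). A quick sanity check, assuming the same history fields as in the sketches above:

```julia
# Relating the iteration axis to the epoch axis, using lengths from the
# history (3500 iteration losses, 500 epoch losses).
iterations_per_epoch = length(iterlosses) ÷ length(epochlosses)   # 3500 ÷ 500 = 7
max_training_rows = iterations_per_epoch * knetmlp_minibatchsize  # 7 * 48 = 336
```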
- "# Plot learning curve: loss vs. iteration, skip the first 100 iterations\n",
- "knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(\n",
- "    knetmlpreg,\n",
- "    :lossvsiteration;\n",
- "    window = 50,\n",
- "    sampleevery = 10,\n",
- "    startat = 100,\n",
- "    endat = :end,\n",
- "    )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/svg+xml": [
- "(SVG image data elided)"
- ],
- "text/plain": [
- "(TikzPicture data elided: true value vs. predicted value for the Knet MLP on the training set; black marks are (true, predicted) pairs, the red line is the identity y = x; axes: xlabel = True value, ylabel = Predicted value)"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Plot true values versus predicted values for multilayer perceptron on training set\n",
- "knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n",
- "    knetmlpreg,\n",
- "    training_features_df,\n",
- "    traininglabels_df,\n",
- "    labelname,\n",
- "    )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/svg+xml": [
- "(SVG image data elided)"
- ],
- "text/plain": [
- "(TikzPicture data elided: true value vs. predicted value for the Knet MLP on the testing set; same layout as the training-set plot above)"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Plot true values versus predicted values for multilayer perceptron on testing set\n",
- "knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n",
- "    knetmlpreg,\n",
- "    testing_features_df,\n",
- "    testing_labels_df,\n",
- "    labelname,\n",
- "    )"
- ]
- },
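The `window = 50` and `sampleevery = 10` arguments in the cells above smooth and subsample the recorded per-iteration losses before plotting. As a point of reference, here is a minimal sketch of that kind of post-processing; the helper below is hypothetical (it is not part of PredictMD) and assumes `window` means a trailing moving average:

```julia
# Hypothetical helper (not part of PredictMD): trailing moving average over
# the per-iteration losses, then subsampling, which is what the window and
# sampleevery arguments above appear to control.
function smoothed_curve(losses::AbstractVector; window::Integer = 50,
                        sampleevery::Integer = 10)
    smoothed = [mean(losses[max(1, i - window + 1):i]) for i in eachindex(losses)]
    return smoothed[1:sampleevery:end]  # keep every `sampleevery`-th point
end
```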
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "(HTML rendering of the 1×2 metrics table elided; identical to the text/plain output that follows)
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Knet MLP │\n", - "├─────┼────────────────────────────────────┼──────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.731002 │" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of multilayer perceptron on training set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " knetmlpreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(HTML rendering of the 1×2 metrics table elided; identical to the text/plain output that follows)
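The two metrics cells report R^2 ≈ 0.731 on the training set (above) and R^2 ≈ 0.554 on the testing set (below); a gap of that size is the usual sign of some overfitting. For reference, a sketch of the same quantity computed by hand, assuming `PredictMD.predict` returns a one-column DataFrame keyed by the label name, as the Appendix A output further down shows:

```julia
# Sketch: R^2 = 1 - SS_res / SS_tot, computed from the model's predictions.
predicted = PredictMD.predict(knetmlpreg, testing_features_df)[labelname]
ytrue     = testing_labels_df[labelname]
ss_res = sum(abs2, ytrue .- predicted)    # residual sum of squares
ss_tot = sum(abs2, ytrue .- mean(ytrue))  # total sum of squares
r2 = 1 - ss_res / ss_tot                  # should land near the 0.554 below
```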
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Knet MLP │\n", - "├─────┼────────────────────────────────────┼──────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.553721 │" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of multilayer perceptron on testing set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " knetmlpreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Section 4: Save trained models to file (if desired) " - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mSaved model to file ./knetmlpreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "if save_trained\n", - " PredictMD.save(knetmlpreg_filename, knetmlpreg)\n", - "end" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Appendix A: Directly access the output of regression models" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(HTML rendering of the first 30 rows of the 354×1 training-set predictions DataFrame elided; see the text/plain output that follows)
" - ], - "text/plain": [ - "354×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼─────────┤\n", - "│ 1 │ 13.6334 │\n", - "│ 2 │ 14.028 │\n", - "│ 3 │ 17.662 │\n", - "│ 4 │ 31.4771 │\n", - "│ 5 │ 17.89 │\n", - "│ 6 │ 25.3657 │\n", - "│ 7 │ 16.8383 │\n", - "│ 8 │ 14.0331 │\n", - "│ 9 │ 12.5159 │\n", - "│ 10 │ 11.4709 │\n", - "│ 11 │ 26.5197 │\n", - "⋮\n", - "│ 343 │ 10.5155 │\n", - "│ 344 │ 34.0588 │\n", - "│ 345 │ 30.9273 │\n", - "│ 346 │ 28.0342 │\n", - "│ 347 │ 14.0834 │\n", - "│ 348 │ 30.2628 │\n", - "│ 349 │ 25.7734 │\n", - "│ 350 │ 21.7633 │\n", - "│ 351 │ 36.9202 │\n", - "│ 352 │ 18.0315 │\n", - "│ 353 │ 24.102 │\n", - "│ 354 │ 22.8622 │" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can use the PredictMD.predict() function to get the real-valued predictions\n", - "# output by each of regression models.\n", - "\n", - "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(knetmlpreg,training_features_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(HTML rendering of the first 30 rows of the 152×1 testing-set predictions DataFrame elided; see the text/plain output that follows)
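The cell below retrieves the raw testing-set predictions as a DataFrame. As an illustrative follow-up (not in the original notebook), those predictions can be turned into residuals and a mean absolute error:

```julia
# Illustrative only: residuals and MAE on the testing set, built from the
# same predict() call shown in the cell below.
predicted_df = PredictMD.predict(knetmlpreg, testing_features_df)
residuals = testing_labels_df[labelname] .- predicted_df[labelname]
mae = mean(abs, residuals)  # mean absolute error, in the units of MedV
```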
" - ], - "text/plain": [ - "152×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼───────────┤\n", - "│ 1 │ 11.3786 │\n", - "│ 2 │ 40.8475 │\n", - "│ 3 │ 26.0719 │\n", - "│ 4 │ 17.6369 │\n", - "│ 5 │ 24.5806 │\n", - "│ 6 │ 8.18154 │\n", - "│ 7 │ -0.228879 │\n", - "│ 8 │ 19.2027 │\n", - "│ 9 │ 32.9259 │\n", - "│ 10 │ 20.0316 │\n", - "│ 11 │ 5.69544 │\n", - "⋮\n", - "│ 141 │ 34.9947 │\n", - "│ 142 │ 13.1767 │\n", - "│ 143 │ 27.8748 │\n", - "│ 144 │ 20.4603 │\n", - "│ 145 │ 16.2792 │\n", - "│ 146 │ 1.79245 │\n", - "│ 147 │ 26.3703 │\n", - "│ 148 │ 22.3113 │\n", - "│ 149 │ 19.9377 │\n", - "│ 150 │ 17.0728 │\n", - "│ 151 │ 18.6442 │\n", - "│ 152 │ 24.1571 │" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(knetmlpreg,testing_features_df)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 0.6.2", - "language": "julia", - "name": "julia-0.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "0.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples_old/boston_housing/boston_housing_mlp.jl b/examples_old/boston_housing/boston_housing_mlp.jl deleted file mode 100644 index af22d212f..000000000 --- a/examples_old/boston_housing/boston_housing_mlp.jl +++ /dev/null @@ -1,263 +0,0 @@ - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -load_pretrained = false -save_trained = true - -# load_trained = true -# save_trained = false - -knetmlpreg_filename = "./knetmlpreg.jld2" - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -#take a quick look at file header and few rows -DataFrames.head(df) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] - -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if load_pretrained == "true" -else - contrasts = PredictMD.contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Display for exploration -display(DataFrames.head(features_df)) -display(DataFrames.head(labels_df)) - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname]) - -# Split data into training set (70%) and testing set (30%) -training_features_df,testing_features_df,traininglabels_df,testing_labels_df = - PredictMD.split_data(features_df,labels_df,0.7); - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - training::Bool = false, - ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -if load_pretrained - # No need to initialize weights since we are going to load them from file - knetmlp_modelweights = Any[] -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,10,contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,10,1) # biases - ), - # - # output layer (regression nets have exactly 1 neuron in output layer): - Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights - ), - Cfloat.( - zeros(Cfloat,1,1) # biases - ), - ] -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict(modelweights, x), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -# Define loss hyperparameters -knetmlp_losshyperparameters = Dict() -knetmlp_losshyperparameters[:L1] = Cfloat(0.0) -knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - -# Select optimization algorithm -knetmlp_optimizationalgorithm = :Adam - -# Set optimization hyperparameters -knetmlp_optimizerhyperparameters = Dict() - -# Set the minibatch size -knetmlp_minibatchsize = 48 - -# Set the max number of epochs. After training, look at the learning curve. If -# it looks like the model has not yet converged, raise maxepochs. If it looks -# like the loss has hit a plateau and you are worried about overfitting, lower -# maxepochs. -knetmlp_maxepochs = 500 - -# Set up multilayer perceptron model -knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - ) - -if load_pretrained == "true" - PredictMD.load!(knetmlpreg_filename, knetmlpreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(knetmlpreg , feature_contrasts) - # Train multilayer perceptron model on training set - PredictMD.fit!(knetmlpreg,training_features_df,traininglabels_df,) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsepoch; - ) - -# Plot learning curve: loss vs. 
epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsepoch; - startat = 10, - endat = :end, - ) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsiteration; - window = 50, - sampleevery = 10, - ) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpreg, - :lossvsiteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) - -# Plot true values versus predicted values for multilayer perceptron on training set -knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Plot true values versus predicted values for multilayer perceptron on testing set -knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -# Evaluate performance of multilayer perceptron on training set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -if save_trained - PredictMD.save(knetmlpreg_filename, knetmlpreg) -end - -# We can use the PredictMD.predict() function to get the real-valued predictions -# output by each of regression models. - -# Get real-valued predictions from each model for training set -PredictMD.predict(knetmlpreg,training_features_df) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(knetmlpreg,testing_features_df) diff --git a/examples_old/boston_housing/boston_housing_random_forest.ipynb b/examples_old/boston_housing/boston_housing_random_forest.ipynb deleted file mode 100644 index 6e4c2f1d0..000000000 --- a/examples_old/boston_housing/boston_housing_random_forest.ipynb +++ /dev/null @@ -1,1534 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single-Label Random Forest Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 1: Setup " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MersenneTwister(UInt32[0x000003e7], Base.dSFMT.DSFMT_state(Int32[-412893719, 1072748155, -748568654, 1073610384, -1271302057, 1073556021, -429186579, 1073162675, 932796209, 1073458022 … 1115928124, 1073598513, 1280798571, 1072732908, -581554620, 1977796709, 1774936613, -1100988421, 382, 0]), [1.42493, 1.73642, 1.96773, 1.25616, 1.79498, 1.08418, 1.03251, 1.24742, 1.37342, 1.61376 … 1.90542, 1.22308, 1.11159, 1.10392, 1.64138, 1.6187, 1.71929, 1.0499, 1.28179, 1.02264], 382)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# import required packages\n", - "import PredictMD\n", - "import CSV\n", - "import DataFrames\n", - "import GZip\n", - "import StatsBase\n", - "\n", - "# set the seed of the global random number generator\n", - "# this makes the results reproducible\n", - "srand(999)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running for the first time\n", - "\n", - "If you are 
running this file for the first time and/or if you do not have\n", - "any trained models saved to disk, uncomment the lines below to train a model and save it to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_pretrained = false\n", - "save_trained = true" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a pre-trained model\n", - "\n", - "If you already have trained models saved, and you would like to load those. Uncomment the lines below" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# load_pretrained = true\n", - "# save_trained = false" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set your paths" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "randomforestreg_filename = \"./randomforestreg.jld2\";" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 2: Prepare data " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
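Before the data preparation below, note how the `load_pretrained`/`save_trained` flags set above drive the rest of the notebook. Condensed from the cells that follow (every name here is defined later in this notebook), the intended pattern is:

```julia
# Condensed from the cells below: train from scratch or reload from disk.
if load_pretrained
    PredictMD.load!(randomforestreg_filename, randomforestreg)
else
    PredictMD.set_feature_contrasts!(randomforestreg, contrasts)
    PredictMD.fit!(randomforestreg, training_features_df, traininglabels_df)
end
save_trained && PredictMD.save(randomforestreg_filename, randomforestreg)
```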
(flattened HTML table reconstructed; first six rows of the Boston housing data, all 14 columns:)
Row  Crim     Zn    Indus  Chas  NOx    Rm     Age   Dis     Rad  Tax  PTRatio  Black   LStat  MedV
1    0.00632  18.0  2.31   0     0.538  6.575  65.2  4.09    1    296  15.3     396.9   4.98   24.0
2    0.02731  0.0   7.07   0     0.469  6.421  78.9  4.9671  2    242  17.8     396.9   9.14   21.6
3    0.02729  0.0   7.07   0     0.469  7.185  61.1  4.9671  2    242  17.8     392.83  4.03   34.7
4    0.03237  0.0   2.18   0     0.458  6.998  45.8  6.0622  3    222  18.7     394.63  2.94   33.4
5    0.06905  0.0   2.18   0     0.458  7.147  54.2  6.0622  3    222  18.7     396.9   5.33   36.2
6    0.02985  0.0   2.18   0     0.458  6.43   58.7  6.0622  3    222  18.7     394.12  5.21   28.7
" - ], - "text/plain": [ - "6×14 DataFrames.DataFrame. Omitted printing of 5 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼───────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 0.00632 │ 18.0 │ 2.31 │ 0 │ 0.538 │ 6.575 │ 65.2 │ 4.09 │ 1 │\n", - "│ 2 │ 0.02731 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 6.421 │ 78.9 │ 4.9671 │ 2 │\n", - "│ 3 │ 0.02729 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 7.185 │ 61.1 │ 4.9671 │ 2 │\n", - "│ 4 │ 0.03237 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.998 │ 45.8 │ 6.0622 │ 3 │\n", - "│ 5 │ 0.06905 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 7.147 │ 54.2 │ 6.0622 │ 3 │\n", - "│ 6 │ 0.02985 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.43 │ 58.7 │ 6.0622 │ 3 │" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Import Boston housing data\n", - "df = CSV.read(\n", - " GZip.gzopen(joinpath(Pkg.dir(\"RDatasets\"),\"data\",\"MASS\",\"Boston.csv.gz\")),\n", - " DataFrames.DataFrame,\n", - " )\n", - "\n", - "#take a quick look at file header and few rows\n", - "DataFrames.head(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "13-element Array{Symbol,1}:\n", - " :Crim \n", - " :Zn \n", - " :Indus \n", - " :Chas \n", - " :NOx \n", - " :Rm \n", - " :Age \n", - " :Dis \n", - " :Rad \n", - " :Tax \n", - " :PTRatio\n", - " :Black \n", - " :LStat " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Remove rows with missing data\n", - "DataFrames.dropmissing!(df)\n", - "\n", - "# Shuffle rows\n", - "PredictMD.shuffle_rows!(df)\n", - "\n", - "# Define labels\n", - "categoricalfeaturenames = Symbol[]\n", - "\n", - "continuousfeaturenames = Symbol[\n", - " :Crim,\n", - " :Zn,\n", - " :Indus,\n", - " :Chas,\n", - " :NOx,\n", - " :Rm,\n", - " :Age,\n", - " :Dis,\n", - " :Rad,\n", - " :Tax,\n", - " :PTRatio,\n", - " :Black,\n", - " :LStat,\n", - " ]\n", - "featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PredictMD.ImmutableDataFrameFeatureContrasts(Symbol[:Crim, :Zn, :Indus, :Chas, :NOx, :Rm, :Age, :Dis, :Rad, :Tax, :PTRatio, :Black, :LStat], 13, Dict{Symbol,StatsModels.ContrastsMatrix}(), 13)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if load_pretrained\n", - "else\n", - " contrasts = PredictMD.contrasts(df, featurenames)\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
(flattened HTML table reconstructed; first six rows of the shuffled feature DataFrame, all 13 columns:)
Row  Crim     Zn    Indus  Chas  NOx     Rm     Age   Dis     Rad  Tax  PTRatio  Black   LStat
1    88.9762  0.0   18.1   0     0.671   6.968  91.9  1.4165  24   666  20.2     396.9   17.21
2    0.06047  0.0   2.46   0     0.488   6.153  68.8  3.2797  3    193  17.8     387.11  13.15
3    0.1712   0.0   8.56   0     0.52    5.836  91.9  2.211   5    384  20.9     395.67  18.66
4    0.54452  0.0   21.89  0     0.624   6.151  97.9  1.6687  4    437  21.2     396.9   18.46
5    0.03466  35.0  6.06   0     0.4379  6.031  23.3  6.6407  1    304  16.9     362.25  7.83
6    0.02009  95.0  2.68   0     0.4161  8.034  31.9  5.118   4    224  14.7     390.55  2.88
" - ], - "text/plain": [ - "6×13 DataFrames.DataFrame. Omitted printing of 4 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼────────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 88.9762 │ 0.0 │ 18.1 │ 0 │ 0.671 │ 6.968 │ 91.9 │ 1.4165 │ 24 │\n", - "│ 2 │ 0.06047 │ 0.0 │ 2.46 │ 0 │ 0.488 │ 6.153 │ 68.8 │ 3.2797 │ 3 │\n", - "│ 3 │ 0.1712 │ 0.0 │ 8.56 │ 0 │ 0.52 │ 5.836 │ 91.9 │ 2.211 │ 5 │\n", - "│ 4 │ 0.54452 │ 0.0 │ 21.89 │ 0 │ 0.624 │ 6.151 │ 97.9 │ 1.6687 │ 4 │\n", - "│ 5 │ 0.03466 │ 35.0 │ 6.06 │ 0 │ 0.4379 │ 6.031 │ 23.3 │ 6.6407 │ 1 │\n", - "│ 6 │ 0.02009 │ 95.0 │ 2.68 │ 0 │ 0.4161 │ 8.034 │ 31.9 │ 5.118 │ 4 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - }, - { - "data": { - "text/html": [ - "
(HTML rendering of the 6×1 labels table elided; identical to the text/plain output that follows)"
- ],
- "text/plain": [
- "6×1 DataFrames.DataFrame\n",
- "│ Row │ MedV │\n",
- "├─────┼──────┤\n",
- "│ 1   │ 10.4 │\n",
- "│ 2   │ 29.6 │\n",
- "│ 3   │ 19.5 │\n",
- "│ 4   │ 17.8 │\n",
- "│ 5   │ 19.4 │\n",
- "│ 6   │ 50.0 │"
- ]
- },
- "metadata": {},
- "output_type": "display_data",
- "source": "julia"
- }
- ],
- "source": [
- "# Define labels\n",
- "labelname = :MedV\n",
- "\n",
- "# Put features and labels in separate dataframes\n",
- "features_df = df[featurenames]\n",
- "labels_df = df[[labelname]]\n",
- "\n",
- "# Display for exploration\n",
- "display(DataFrames.head(features_df))\n",
- "display(DataFrames.head(labels_df))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Summary Stats:\n",
- "Mean:           22.532806\n",
- "Minimum:        5.000000\n",
- "1st Quartile:   17.025000\n",
- "Median:         21.200000\n",
- "3rd Quartile:   25.000000\n",
- "Maximum:        50.000000\n",
- "Length:         506\n",
- "Type:           Union{Float64, Missings.Missing}\n",
- "Number Missing: 0\n",
- "% Missing:      0.000000\n"
- ]
- }
- ],
- "source": [
- "# View summary statistics for label variable (mean, quartiles, etc.)\n",
- "DataFrames.describe(labels_df[labelname])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Split data into training set (70%) and testing set (30%)\n",
- "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n",
- "    PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Section 3: Set up and train models"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Random forest regression"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "INFO: Starting to train DecisionTree.jl model.\n",
- "INFO: Finished training DecisionTree.jl model.\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "PredictMD.DecisionTreeModel(\"Random forest\", false, true, :MedV, Any[], Dict{Any,Any}(Pair{Any,Any}(:nsubfeatures, 2),Pair{Any,Any}(:ntrees, 20)), Ensemble of Decision Trees\n",
- "Trees:      20\n",
- "Avg Leaves: 145.0\n",
- "Avg Depth:  9.75)"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Set up random forest regression model\n",
- "randomforestreg = PredictMD.singlelabeldataframerandomforestregression(\n",
- "    featurenames,\n",
- "    labelname;\n",
- "    nsubfeatures = 2, # number of subfeatures; defaults to 2\n",
- "    ntrees = 20, # number of trees; defaults to 10\n",
- "    package = :DecisionTreejl,\n",
- "    name = \"Random forest\" # optional\n",
- "    )\n",
- "\n",
- "if load_pretrained\n",
- "    PredictMD.load!(randomforestreg_filename, randomforestreg)\n",
- "else\n",
- "    # set feature contrasts\n",
- "    PredictMD.set_feature_contrasts!(randomforestreg, contrasts)\n",
- "    # Train random forest model on training set\n",
- "    PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,)\n",
- "end"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/svg+xml": [
- "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[ylabel = {Predicted value}, xlabel = {True value}]\\\\addplot+ [only marks = {true}, black,fill=black]coordinates {\\n(17.3, 16.16)\\n(18.5, 20.06)\\n(14.9, 16.53)\\n(31.1, 30.33)\\n(12.6, 14.81)\\n(29.6, 28.98)\\n(23.1, 20.375)\\n(19.1, 17.135)\\n(7.0, 11.995)\\n(19.4, 18.455)\\n(21.6, 21.96)\\n(14.2, 17.51)\\n(29.6, 24.155)\\n(21.8, 21.535)\\n(11.9, 13.685)\\n(12.1, 13.915)\\n(50.0, 42.27)\\n(22.2, 23.375)\\n(22.9, 25.955)\\n(48.3, 39.925)\\n(21.5, 19.17)\\n(21.6, 22.095)\\n(36.5, 39.24)\\n(19.6, 19.675)\\n(19.9, 19.015)\\n(16.3, 15.65)\\n(21.4, 21.455)\\n(19.1, 17.855)\\n(26.6, 26.665)\\n(37.2, 34.27)\\n(12.5, 14.41)\\n(20.4, 19.4)\\n(22.6, 24.845)\\n(28.2, 28.63)\\n(13.4, 16.415)\\n(23.3, 21.97)\\n(42.8, 33.12)\\n(21.2, 21.315)\\n(22.9, 22.205)\\n(50.0, 41.025)\\n(15.1, 13.56)\\n(19.5, 19.31)\\n(21.7, 21.485)\\n(23.3, 23.48)\\n(14.1, 14.785)\\n(20.5, 19.335)\\n(14.0, 14.805)\\n(43.1, 40.68)\\n(45.4, 42.215)\\n(36.4, 33.735)\\n(23.7, 25.125)\\n(31.5, 34.455)\\n(13.6, 15.09)\\n(16.4, 17.62)\\n(15.2, 15.815)\\n(27.9, 30.65)\\n(23.0, 23.805)\\n(19.8, 23.06)\\n(35.1, 36.04)\\n(17.8, 17.05)\\n(36.0, 37.325)\\n(18.4, 17.465)\\n(10.5, 10.97)\\n(50.0, 45.395)\\n(20.1, 19.04)\\n(23.9, 26.19)\\n(48.8, 45.8)\\n(19.3, 19.285)\\n(14.5, 15.57)\\n(24.2, 24.31)\\n(29.8, 26.695)\\n(23.3, 24.445)\\n(14.5, 17.185)\\n(15.6, 16.67)\\n(8.8, 14.135)\\n(19.6, 21.145)\\n(21.9, 20.5)\\n(15.0, 19.445)\\n(32.0, 32.375)\\n(20.1, 17.56)\\n(7.2, 13.37)\\n(23.5, 24.96)\\n(33.0, 30.66)\\n(26.6, 28.865)\\n(5.0, 11.065)\\n(17.1, 17.845)\\n(16.8, 18.715)\\n(29.9, 31.725)\\n(19.3, 20.275)\\n(20.3, 20.175)\\n(29.1, 29.985)\\n(27.5, 27.755)\\n(10.9, 13.885)\\n(24.4, 25.03)\\n(22.5, 22.195)\\n(14.8, 16.04)\\n(18.2, 18.455)\\n(24.8, 24.005)\\n(18.5, 19.81)\\n(23.8, 24.795)\\n(17.6, 20.35)\\n(24.0, 22.37)\\n(17.5, 17.48)\\n(23.2, 19.765)\\n(20.8, 21.305)\\n(15.0, 14.64)\\n(24.1, 25.81)\\n(18.9, 19.15)\\n(25.0, 23.425)\\n(31.6, 37.435)\\n(16.0, 17.225)\\n(30.5, 30.145)\\n(32.4, 31.14)\\n(23.3, 21.43)\\n(33.1, 32.105)\\n(50.0, 42.785)\\n(19.2, 20.035)\\n(23.2, 23.675)\\n(27.5, 26.46)\\n(23.0, 21.44)\\n(8.4, 14.4)\\n(17.4, 18.765)\\n(23.8, 24.22)\\n(22.9, 23.465)\\n(22.4, 21.075)\\n(19.6, 19.155)\\n(31.0, 32.37)\\n(41.7, 39.38)\\n(27.9, 21.31)\\n(24.4, 24.875)\\n(36.1, 33.3)\\n(15.3, 18.52)\\n(16.6, 16.88)\\n(20.6, 20.1)\\n(32.0, 31.07)\\n(24.5, 27.57)\\n(23.1, 22.325)\\n(50.0, 42.715)\\n(12.0, 14.645)\\n(18.2, 18.19)\\n(19.4, 20.21)\\n(11.5, 12.285)\\n(14.1, 15.085)\\n(44.8, 41.18)\\n(17.1, 16.2)\\n(8.1, 11.26)\\n(28.0, 28.965)\\n(15.7, 15.73)\\n(23.7, 23.465)\\n(36.2, 34.47)\\n(19.3, 20.08)\\n(21.4, 23.98)\\n(29.1, 28.37)\\n(16.7, 16.875)\\n(34.9, 34.835)\\n(26.4, 25.86)\\n(20.6, 21.595)\\n(17.0, 17.995)\\n(14.5, 15.785)\\n(33.4, 34.21)\\n(19.8, 20.195)\\n(23.0, 22.175)\\n(15.6, 16.355)\\n(22.5, 21.7)\\n(21.2, 21.14)\\n(20.5, 22.075)\\n(23.7, 23.295)\\n(20.9, 
21.14)\\n(17.9, 14.62)\\n(10.2, 12.275)\\n(24.7, 23.795)\\n(30.3, 28.185)\\n(13.8, 16.315)\\n(13.4, 14.305)\\n(23.6, 24.28)\\n(23.1, 19.78)\\n(22.7, 22.565)\\n(21.7, 19.34)\\n(14.1, 16.07)\\n(27.5, 19.025)\\n(18.7, 18.545)\\n(34.6, 31.89)\\n(14.9, 14.97)\\n(12.8, 12.105)\\n(11.8, 13.605)\\n(25.0, 24.635)\\n(16.2, 18.275)\\n(19.2, 19.72)\\n(29.8, 32.05)\\n(14.4, 17.375)\\n(28.4, 28.195)\\n(20.6, 21.215)\\n(25.0, 24.49)\\n(20.4, 21.29)\\n(24.1, 24.635)\\n(7.4, 9.825)\\n(10.4, 12.015)\\n(20.8, 21.53)\\n(13.8, 15.675)\\n(42.3, 37.92)\\n(20.0, 19.32)\\n(22.2, 21.965)\\n(50.0, 42.65)\\n(25.3, 24.81)\\n(48.5, 43.96)\\n(23.2, 23.805)\\n(21.0, 21.61)\\n(31.7, 36.51)\\n(23.1, 23.495)\\n(20.7, 21.335)\\n(30.1, 28.825)\\n(17.4, 19.06)\\n(22.0, 24.105)\\n(16.5, 18.335)\\n(50.0, 42.355)\\n(38.7, 36.585)\\n(17.4, 18.865)\\n(11.7, 13.915)\\n(19.5, 18.01)\\n(21.4, 20.365)\\n(18.5, 19.115)\\n(20.3, 20.125)\\n(13.0, 14.855)\\n(15.4, 15.565)\\n(21.7, 21.95)\\n(20.1, 21.455)\\n(21.1, 21.46)\\n(28.7, 26.56)\\n(19.0, 17.405)\\n(30.1, 28.755)\\n(22.6, 22.555)\\n(23.4, 23.7)\\n(16.7, 19.135)\\n(13.1, 16.27)\\n(24.3, 22.61)\\n(20.4, 20.07)\\n(10.2, 13.285)\\n(19.3, 20.22)\\n(33.2, 34.325)\\n(37.3, 34.65)\\n(22.6, 23.925)\\n(21.4, 19.485)\\n(33.1, 32.96)\\n(17.5, 18.945)\\n(8.3, 11.945)\\n(22.0, 22.905)\\n(13.1, 13.635)\\n(16.2, 15.82)\\n(50.0, 42.515)\\n(13.4, 13.565)\\n(21.7, 21.155)\\n(19.7, 20.775)\\n(19.6, 21.05)\\n(17.8, 16.145)\\n(22.6, 23.165)\\n(50.0, 43.52)\\n(17.8, 17.09)\\n(21.5, 20.475)\\n(20.1, 21.065)\\n(18.2, 21.32)\\n(9.6, 12.43)\\n(12.3, 13.045)\\n(22.6, 21.19)\\n(17.5, 18.8)\\n(17.8, 18.14)\\n(25.2, 24.715)\\n(13.8, 12.54)\\n(13.9, 15.74)\\n(20.2, 19.795)\\n(19.4, 18.865)\\n(13.3, 14.945)\\n(23.8, 22.93)\\n(13.6, 16.06)\\n(25.0, 23.805)\\n(13.9, 15.01)\\n(20.2, 19.425)\\n(32.5, 29.97)\\n(23.1, 24.32)\\n(18.4, 17.245)\\n(24.4, 24.41)\\n(13.4, 14.955)\\n(34.9, 33.5)\\n(30.1, 34.53)\\n(22.0, 21.205)\\n(29.0, 28.77)\\n(21.7, 21.995)\\n(13.5, 15.64)\\n(11.8, 15.425)\\n(18.8, 18.945)\\n(34.7, 34.84)\\n(34.9, 33.93)\\n(24.3, 24.515)\\n(28.4, 27.18)\\n(33.8, 38.055)\\n(18.9, 20.035)\\n(28.1, 25.57)\\n(21.2, 21.465)\\n(22.8, 23.905)\\n(18.7, 19.47)\\n(32.9, 30.655)\\n(10.2, 12.525)\\n(39.8, 37.265)\\n(50.0, 44.895)\\n(12.7, 14.31)\\n(20.8, 17.61)\\n(21.4, 22.185)\\n(18.6, 20.075)\\n(27.5, 23.695)\\n(24.1, 25.045)\\n(22.8, 28.245)\\n(14.6, 16.77)\\n(15.6, 16.565)\\n(25.1, 27.42)\\n(22.0, 24.265)\\n(17.2, 18.43)\\n(32.7, 33.105)\\n(13.5, 14.62)\\n(31.6, 31.35)\\n(20.0, 20.81)\\n(19.4, 20.27)\\n(22.7, 21.4)\\n(23.7, 24.035)\\n(20.6, 20.67)\\n(29.0, 29.435)\\n(24.8, 26.385)\\n(10.5, 10.68)\\n(9.7, 11.1)\\n(43.8, 40.79)\\n(24.7, 24.28)\\n(22.0, 20.85)\\n(21.0, 20.44)\\n(12.7, 16.47)\\n(24.5, 22.43)\\n(15.4, 16.455)\\n(27.0, 26.635)\\n(18.3, 19.06)\\n(7.2, 9.83)\\n(14.3, 15.295)\\n(19.1, 19.59)\\n(24.5, 25.215)\\n(19.5, 18.875)\\n(7.5, 13.03)\\n(5.6, 10.62)\\n(35.2, 32.845)\\n(32.2, 28.81)\\n(22.3, 23.355)\\n(25.0, 22.855)\\n(29.4, 28.715)\\n(23.0, 23.315)\\n(22.4, 22.15)\\n(46.0, 44.385)\\n(19.9, 18.24)\\n(21.1, 21.74)\\n(21.2, 20.955)\\n};\\n\\\\addplot+ [mark = {none}, red]coordinates {\\n(5.0, 5.0)\\n(5.6, 5.6)\\n(7.0, 7.0)\\n(7.2, 7.2)\\n(7.4, 7.4)\\n(7.5, 7.5)\\n(8.1, 8.1)\\n(8.3, 8.3)\\n(8.4, 8.4)\\n(8.8, 8.8)\\n(9.6, 9.6)\\n(9.7, 9.7)\\n(10.2, 10.2)\\n(10.4, 10.4)\\n(10.5, 10.5)\\n(10.9, 10.9)\\n(11.5, 11.5)\\n(11.7, 11.7)\\n(11.8, 11.8)\\n(11.9, 11.9)\\n(12.0, 12.0)\\n(12.1, 12.1)\\n(12.3, 12.3)\\n(12.5, 12.5)\\n(12.6, 12.6)\\n(12.7, 12.7)\\n(12.8, 12.8)\\n(13.0, 13.0)\\n(13.1, 13.1)\\n(13.3, 
13.3)\\n(13.4, 13.4)\\n(13.5, 13.5)\\n(13.6, 13.6)\\n(13.8, 13.8)\\n(13.9, 13.9)\\n(14.0, 14.0)\\n(14.1, 14.1)\\n(14.2, 14.2)\\n(14.3, 14.3)\\n(14.4, 14.4)\\n(14.5, 14.5)\\n(14.6, 14.6)\\n(14.8, 14.8)\\n(14.9, 14.9)\\n(15.0, 15.0)\\n(15.1, 15.1)\\n(15.2, 15.2)\\n(15.3, 15.3)\\n(15.4, 15.4)\\n(15.6, 15.6)\\n(15.7, 15.7)\\n(16.0, 16.0)\\n(16.2, 16.2)\\n(16.3, 16.3)\\n(16.4, 16.4)\\n(16.5, 16.5)\\n(16.6, 16.6)\\n(16.7, 16.7)\\n(16.8, 16.8)\\n(17.0, 17.0)\\n(17.1, 17.1)\\n(17.2, 17.2)\\n(17.3, 17.3)\\n(17.4, 17.4)\\n(17.5, 17.5)\\n(17.6, 17.6)\\n(17.8, 17.8)\\n(17.9, 17.9)\\n(18.2, 18.2)\\n(18.3, 18.3)\\n(18.4, 18.4)\\n(18.5, 18.5)\\n(18.6, 18.6)\\n(18.7, 18.7)\\n(18.8, 18.8)\\n(18.9, 18.9)\\n(19.0, 19.0)\\n(19.1, 19.1)\\n(19.2, 19.2)\\n(19.3, 19.3)\\n(19.4, 19.4)\\n(19.5, 19.5)\\n(19.6, 19.6)\\n(19.7, 19.7)\\n(19.8, 19.8)\\n(19.9, 19.9)\\n(20.0, 20.0)\\n(20.1, 20.1)\\n(20.2, 20.2)\\n(20.3, 20.3)\\n(20.4, 20.4)\\n(20.5, 20.5)\\n(20.6, 20.6)\\n(20.7, 20.7)\\n(20.8, 20.8)\\n(20.9, 20.9)\\n(21.0, 21.0)\\n(21.1, 21.1)\\n(21.2, 21.2)\\n(21.4, 21.4)\\n(21.5, 21.5)\\n(21.6, 21.6)\\n(21.7, 21.7)\\n(21.8, 21.8)\\n(21.9, 21.9)\\n(22.0, 22.0)\\n(22.2, 22.2)\\n(22.3, 22.3)\\n(22.4, 22.4)\\n(22.5, 22.5)\\n(22.6, 22.6)\\n(22.7, 22.7)\\n(22.8, 22.8)\\n(22.9, 22.9)\\n(23.0, 23.0)\\n(23.1, 23.1)\\n(23.2, 23.2)\\n(23.3, 23.3)\\n(23.4, 23.4)\\n(23.5, 23.5)\\n(23.6, 23.6)\\n(23.7, 23.7)\\n(23.8, 23.8)\\n(23.9, 23.9)\\n(24.0, 24.0)\\n(24.1, 24.1)\\n(24.2, 24.2)\\n(24.3, 24.3)\\n(24.4, 24.4)\\n(24.5, 24.5)\\n(24.7, 24.7)\\n(24.8, 24.8)\\n(25.0, 25.0)\\n(25.1, 25.1)\\n(25.2, 25.2)\\n(25.3, 25.3)\\n(26.4, 26.4)\\n(26.6, 26.6)\\n(27.0, 27.0)\\n(27.5, 27.5)\\n(27.9, 27.9)\\n(28.0, 28.0)\\n(28.1, 28.1)\\n(28.2, 28.2)\\n(28.4, 28.4)\\n(28.7, 28.7)\\n(29.0, 29.0)\\n(29.1, 29.1)\\n(29.4, 29.4)\\n(29.6, 29.6)\\n(29.8, 29.8)\\n(29.9, 29.9)\\n(30.1, 30.1)\\n(30.3, 30.3)\\n(30.5, 30.5)\\n(31.0, 31.0)\\n(31.1, 31.1)\\n(31.5, 31.5)\\n(31.6, 31.6)\\n(31.7, 31.7)\\n(32.0, 32.0)\\n(32.2, 32.2)\\n(32.4, 32.4)\\n(32.5, 32.5)\\n(32.7, 32.7)\\n(32.9, 32.9)\\n(33.0, 33.0)\\n(33.1, 33.1)\\n(33.2, 33.2)\\n(33.4, 33.4)\\n(33.8, 33.8)\\n(34.6, 34.6)\\n(34.7, 34.7)\\n(34.9, 34.9)\\n(35.1, 35.1)\\n(35.2, 35.2)\\n(36.0, 36.0)\\n(36.1, 36.1)\\n(36.2, 36.2)\\n(36.4, 36.4)\\n(36.5, 36.5)\\n(37.2, 37.2)\\n(37.3, 37.3)\\n(38.7, 38.7)\\n(39.8, 39.8)\\n(41.7, 41.7)\\n(42.3, 42.3)\\n(42.8, 42.8)\\n(43.1, 43.1)\\n(43.8, 43.8)\\n(44.8, 44.8)\\n(45.4, 45.4)\\n(46.0, 46.0)\\n(48.3, 48.3)\\n(48.5, 48.5)\\n(48.8, 48.8)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for random forest on training set\n", - "randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " randomforestreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - 
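The plot objects returned by `PredictMD.plotsinglelabelregressiontrueversuspredicted` are `TikzPictures.TikzPicture` values (as the omitted output above shows), so they can presumably be written to disk with the TikzPictures.jl API. A minimal sketch, assuming TikzPictures.jl's `save` function with `PDF`/`SVG` sinks and a working LaTeX installation; this is not a PredictMD API:

```julia
import TikzPictures

# Write the training-set plot to disk as PDF and as SVG.
TikzPictures.save(TikzPictures.PDF("randomforestreg_training_plot"), randomforestreg_plot_training)
TikzPictures.save(TikzPictures.SVG("randomforestreg_training_plot"), randomforestreg_plot_training)
```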
"\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[ylabel = {Predicted value}, xlabel = {True value}]\\\\addplot+ [only marks = {true}, black,fill=black]coordinates {\\n(10.4, 15.655)\\n(50.0, 42.575)\\n(50.0, 26.14)\\n(15.2, 17.475)\\n(26.2, 26.535)\\n(5.0, 12.49)\\n(7.0, 13.085)\\n(27.1, 19.725)\\n(35.4, 37.745)\\n(17.7, 18.495)\\n(8.8, 12.785)\\n(9.5, 13.78)\\n(31.2, 30.635)\\n(50.0, 25.475)\\n(8.4, 16.425)\\n(14.6, 17.86)\\n(19.7, 20.5)\\n(23.8, 22.315)\\n(20.0, 17.405)\\n(21.2, 23.525)\\n(30.7, 28.055)\\n(15.6, 16.8)\\n(50.0, 21.98)\\n(17.2, 16.685)\\n(19.6, 17.705)\\n(17.2, 15.59)\\n(18.1, 17.855)\\n(25.0, 24.355)\\n(22.2, 21.775)\\n(30.8, 32.295)\\n(37.9, 33.525)\\n(22.8, 23.43)\\n(20.4, 18.98)\\n(15.2, 15.38)\\n(41.3, 36.82)\\n(12.7, 13.695)\\n(20.1, 20.025)\\n(36.2, 23.21)\\n(21.9, 25.535)\\n(16.6, 20.3)\\n(24.4, 22.655)\\n(8.3, 12.64)\\n(35.4, 32.585)\\n(18.0, 18.625)\\n(23.2, 20.135)\\n(24.7, 24.035)\\n(22.9, 23.54)\\n(28.5, 32.22)\\n(8.5, 12.43)\\n(25.0, 25.815)\\n(24.6, 27.1)\\n(13.1, 12.425)\\n(14.3, 15.985)\\n(7.2, 12.135)\\n(28.6, 29.065)\\n(23.4, 22.1)\\n(14.9, 17.615)\\n(10.9, 
16.11)\\n(11.0, 15.26)\\n(8.5, 12.77)\\n(28.7, 32.425)\\n(20.0, 18.86)\\n(23.9, 22.825)\\n(15.6, 17.785)\\n(43.5, 39.0)\\n(44.0, 40.805)\\n(22.1, 23.6)\\n(20.5, 21.75)\\n(24.8, 29.37)\\n(22.3, 22.02)\\n(15.0, 20.32)\\n(13.3, 16.13)\\n(11.9, 21.195)\\n(22.0, 26.16)\\n(18.8, 19.015)\\n(18.7, 21.345)\\n(23.1, 24.985)\\n(20.0, 20.595)\\n(21.7, 20.145)\\n(37.6, 41.535)\\n(37.0, 29.905)\\n(24.8, 25.21)\\n(26.7, 29.51)\\n(19.3, 20.16)\\n(16.1, 17.6)\\n(20.3, 20.97)\\n(21.0, 20.095)\\n(10.8, 15.3)\\n(50.0, 39.755)\\n(20.6, 22.075)\\n(22.2, 21.32)\\n(14.4, 15.48)\\n(23.6, 31.995)\\n(18.9, 22.05)\\n(22.0, 25.565)\\n(26.6, 29.36)\\n(20.6, 21.775)\\n(23.9, 25.89)\\n(24.6, 24.765)\\n(6.3, 13.965)\\n(21.7, 20.04)\\n(11.7, 13.905)\\n(33.4, 31.335)\\n(24.3, 22.26)\\n(13.1, 15.245)\\n(26.5, 28.045)\\n(20.9, 22.765)\\n(50.0, 39.295)\\n(23.9, 29.32)\\n(8.7, 14.03)\\n(13.2, 16.075)\\n(18.3, 21.085)\\n(21.8, 20.575)\\n(18.6, 24.175)\\n(19.5, 21.455)\\n(24.0, 27.765)\\n(22.5, 26.135)\\n(26.4, 21.47)\\n(16.8, 19.44)\\n(33.2, 32.93)\\n(21.9, 32.355)\\n(23.9, 29.505)\\n(46.7, 36.855)\\n(17.1, 18.345)\\n(16.5, 24.65)\\n(25.0, 27.1)\\n(20.7, 21.21)\\n(19.8, 18.765)\\n(22.8, 25.48)\\n(18.4, 20.265)\\n(18.5, 21.555)\\n(31.5, 32.52)\\n(13.3, 15.645)\\n(19.9, 19.41)\\n(22.2, 24.125)\\n(16.1, 21.09)\\n(17.8, 19.79)\\n(27.1, 25.22)\\n(23.1, 21.585)\\n(19.1, 20.585)\\n(33.3, 34.27)\\n(11.3, 12.49)\\n(28.7, 28.435)\\n(19.4, 18.59)\\n(19.9, 18.835)\\n(13.8, 12.295)\\n(19.4, 22.1)\\n(20.3, 23.185)\\n(13.8, 14.28)\\n(16.1, 21.875)\\n(18.9, 19.97)\\n(19.0, 21.605)\\n};\\n\\\\addplot+ [mark = {none}, red]coordinates {\\n(5.0, 5.0)\\n(6.3, 6.3)\\n(7.0, 7.0)\\n(7.2, 7.2)\\n(8.3, 8.3)\\n(8.4, 8.4)\\n(8.5, 8.5)\\n(8.7, 8.7)\\n(8.8, 8.8)\\n(9.5, 9.5)\\n(10.4, 10.4)\\n(10.8, 10.8)\\n(10.9, 10.9)\\n(11.0, 11.0)\\n(11.3, 11.3)\\n(11.7, 11.7)\\n(11.9, 11.9)\\n(12.7, 12.7)\\n(13.1, 13.1)\\n(13.2, 13.2)\\n(13.3, 13.3)\\n(13.8, 13.8)\\n(14.3, 14.3)\\n(14.4, 14.4)\\n(14.6, 14.6)\\n(14.9, 14.9)\\n(15.0, 15.0)\\n(15.2, 15.2)\\n(15.6, 15.6)\\n(16.1, 16.1)\\n(16.5, 16.5)\\n(16.6, 16.6)\\n(16.8, 16.8)\\n(17.1, 17.1)\\n(17.2, 17.2)\\n(17.7, 17.7)\\n(17.8, 17.8)\\n(18.0, 18.0)\\n(18.1, 18.1)\\n(18.3, 18.3)\\n(18.4, 18.4)\\n(18.5, 18.5)\\n(18.6, 18.6)\\n(18.7, 18.7)\\n(18.8, 18.8)\\n(18.9, 18.9)\\n(19.0, 19.0)\\n(19.1, 19.1)\\n(19.3, 19.3)\\n(19.4, 19.4)\\n(19.5, 19.5)\\n(19.6, 19.6)\\n(19.7, 19.7)\\n(19.8, 19.8)\\n(19.9, 19.9)\\n(20.0, 20.0)\\n(20.1, 20.1)\\n(20.3, 20.3)\\n(20.4, 20.4)\\n(20.5, 20.5)\\n(20.6, 20.6)\\n(20.7, 20.7)\\n(20.9, 20.9)\\n(21.0, 21.0)\\n(21.2, 21.2)\\n(21.7, 21.7)\\n(21.8, 21.8)\\n(21.9, 21.9)\\n(22.0, 22.0)\\n(22.1, 22.1)\\n(22.2, 22.2)\\n(22.3, 22.3)\\n(22.5, 22.5)\\n(22.8, 22.8)\\n(22.9, 22.9)\\n(23.1, 23.1)\\n(23.2, 23.2)\\n(23.4, 23.4)\\n(23.6, 23.6)\\n(23.8, 23.8)\\n(23.9, 23.9)\\n(24.0, 24.0)\\n(24.3, 24.3)\\n(24.4, 24.4)\\n(24.6, 24.6)\\n(24.7, 24.7)\\n(24.8, 24.8)\\n(25.0, 25.0)\\n(26.2, 26.2)\\n(26.4, 26.4)\\n(26.5, 26.5)\\n(26.6, 26.6)\\n(26.7, 26.7)\\n(27.1, 27.1)\\n(28.5, 28.5)\\n(28.6, 28.6)\\n(28.7, 28.7)\\n(30.7, 30.7)\\n(30.8, 30.8)\\n(31.2, 31.2)\\n(31.5, 31.5)\\n(33.2, 33.2)\\n(33.3, 33.3)\\n(33.4, 33.4)\\n(35.4, 35.4)\\n(36.2, 36.2)\\n(37.0, 37.0)\\n(37.6, 37.6)\\n(37.9, 37.9)\\n(41.3, 41.3)\\n(43.5, 43.5)\\n(44.0, 44.0)\\n(46.7, 46.7)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append 
style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for random forest on testing set\n", - "randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " randomforestreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "[HTML table omitted; see the plain-text table below]
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Random forest │\n", - "├─────┼────────────────────────────────────┼───────────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.929843 │" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of random forest on training set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " randomforestreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[HTML table omitted; see the plain-text table below]
" - ], - "text/plain": [ - "1×2 DataFrames.DataFrame\n", - "│ Row │ metric │ Random forest │\n", - "├─────┼────────────────────────────────────┼───────────────┤\n", - "│ 1 │ R^2 (coefficient of determination) │ 0.699748 │" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Evaluate performance of random forest on testing set\n", - "PredictMD.singlelabelregressionmetrics(\n", - " randomforestreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Section 4: Save trained models to file (if desired) " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mSaved model to file ./randomforestreg.jld2\n", - "\u001b[39m" - ] - } - ], - "source": [ - "if save_trained\n", - " PredictMD.save(randomforestreg_filename, randomforestreg)\n", - "end" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Appendix A: Directly access the output of regression models " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[HTML table omitted: first 30 of 354 predicted MedV values; see the plain-text table below]"
-      ],
-      "text/plain": [
-       "354×1 DataFrames.DataFrame\n",
-       "│ Row │ MedV   │\n",
-       "├─────┼────────┤\n",
-       "│ 1   │ 16.16  │\n",
-       "│ 2   │ 20.06  │\n",
-       "│ 3   │ 16.53  │\n",
-       "│ 4   │ 30.33  │\n",
-       "│ 5   │ 14.81  │\n",
-       "│ 6   │ 28.98  │\n",
-       "│ 7   │ 20.375 │\n",
-       "│ 8   │ 17.135 │\n",
-       "│ 9   │ 11.995 │\n",
-       "│ 10  │ 18.455 │\n",
-       "│ 11  │ 21.96  │\n",
-       "⋮\n",
-       "│ 343 │ 10.62  │\n",
-       "│ 344 │ 32.845 │\n",
-       "│ 345 │ 28.81  │\n",
-       "│ 346 │ 23.355 │\n",
-       "│ 347 │ 22.855 │\n",
-       "│ 348 │ 28.715 │\n",
-       "│ 349 │ 23.315 │\n",
-       "│ 350 │ 22.15  │\n",
-       "│ 351 │ 44.385 │\n",
-       "│ 352 │ 18.24  │\n",
-       "│ 353 │ 21.74  │\n",
-       "│ 354 │ 20.955 │"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# We can use the PredictMD.predict() function to get the real-valued predictions\n",
-    "# output by each of the regression models.\n",
-    "\n",
-    "# Get real-valued predictions from each model for training set\n",
-    "PredictMD.predict(randomforestreg,training_features_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "[HTML table omitted: first 30 of 152 predicted MedV values; see the plain-text table below]
" - ], - "text/plain": [ - "152×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼────────┤\n", - "│ 1 │ 15.655 │\n", - "│ 2 │ 42.575 │\n", - "│ 3 │ 26.14 │\n", - "│ 4 │ 17.475 │\n", - "│ 5 │ 26.535 │\n", - "│ 6 │ 12.49 │\n", - "│ 7 │ 13.085 │\n", - "│ 8 │ 19.725 │\n", - "│ 9 │ 37.745 │\n", - "│ 10 │ 18.495 │\n", - "│ 11 │ 12.785 │\n", - "⋮\n", - "│ 141 │ 34.27 │\n", - "│ 142 │ 12.49 │\n", - "│ 143 │ 28.435 │\n", - "│ 144 │ 18.59 │\n", - "│ 145 │ 18.835 │\n", - "│ 146 │ 12.295 │\n", - "│ 147 │ 22.1 │\n", - "│ 148 │ 23.185 │\n", - "│ 149 │ 14.28 │\n", - "│ 150 │ 21.875 │\n", - "│ 151 │ 19.97 │\n", - "│ 152 │ 21.605 │" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(randomforestreg,testing_features_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 0.6.2", - "language": "julia", - "name": "julia-0.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "0.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples_old/boston_housing/boston_housing_random_forest.jl b/examples_old/boston_housing/boston_housing_random_forest.jl deleted file mode 100644 index 8f22d3aca..000000000 --- a/examples_old/boston_housing/boston_housing_random_forest.jl +++ /dev/null @@ -1,143 +0,0 @@ - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -load_pretrained = false -save_trained = true - -# load_pretrained = true -# save_trained = false - -randomforestreg_filename = "./randomforestreg.jld2"; - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -#take a quick look at file header and few rows -DataFrames.head(df) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] - -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if load_pretrained -else - contrasts = PredictMD.contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Display for exploration -display(DataFrames.head(features_df)) -display(DataFrames.head(labels_df)) - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname])
-
-# Split data into training set (70%) and testing set (30%)
-training_features_df,testing_features_df,traininglabels_df,testing_labels_df =
-    PredictMD.split_data(features_df,labels_df,0.7);
-
-# Set up random forest regression model
-randomforestreg = PredictMD.singlelabeldataframerandomforestregression(
-    featurenames,
-    labelname;
-    nsubfeatures = 2, # number of subfeatures; defaults to 2
-    ntrees = 20, # number of trees; defaults to 10
-    package = :DecisionTreejl,
-    name = "Random forest" # optional
-    )
-
-if load_pretrained
-    PredictMD.load!(randomforestreg_filename, randomforestreg)
-else
-    # set feature contrasts (the `contrasts` object defined above)
-    PredictMD.set_feature_contrasts!(randomforestreg, contrasts)
-    # Train random forest model on training set
-    PredictMD.fit!(randomforestreg,training_features_df,traininglabels_df,)
-end
-
-# Plot true values versus predicted values for random forest on training set
-randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    randomforestreg,
-    training_features_df,
-    traininglabels_df,
-    labelname,
-    )
-
-# Plot true values versus predicted values for random forest on testing set
-randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    randomforestreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-
-# Evaluate performance of random forest on training set
-PredictMD.singlelabelregressionmetrics(
-    randomforestreg,
-    training_features_df,
-    traininglabels_df,
-    labelname,
-    )
-
-# Evaluate performance of random forest on testing set
-PredictMD.singlelabelregressionmetrics(
-    randomforestreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-
-if save_trained
-    PredictMD.save(randomforestreg_filename, randomforestreg)
-end
-
-# We can use the PredictMD.predict() function to get the real-valued predictions
-# output by each of the regression models.
- -# Get real-valued predictions from each model for training set -PredictMD.predict(randomforestreg,training_features_df) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(randomforestreg,testing_features_df) - - diff --git a/examples_old/boston_housing/boston_housing_svm.ipynb b/examples_old/boston_housing/boston_housing_svm.ipynb deleted file mode 100644 index d3de3f5a2..000000000 --- a/examples_old/boston_housing/boston_housing_svm.ipynb +++ /dev/null @@ -1,2488 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Single-Label SVM Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 1: Setup " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MersenneTwister(UInt32[0x000003e7], Base.dSFMT.DSFMT_state(Int32[-412893719, 1072748155, -748568654, 1073610384, -1271302057, 1073556021, -429186579, 1073162675, 932796209, 1073458022 … 1115928124, 1073598513, 1280798571, 1072732908, -581554620, 1977796709, 1774936613, -1100988421, 382, 0]), [1.67737, 1.24739, 1.02128, 1.82551, 1.39394, 1.48296, 1.96695, 1.01663, 1.3779, 1.92928 … 1.66935, 1.18935, 1.87348, 1.26745, 1.38246, 1.79401, 1.10363, 1.40961, 1.67252, 1.03805], 382)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# import required packages\n", - "import PredictMD\n", - "import CSV\n", - "import DataFrames\n", - "import GZip\n", - "import Knet\n", - "import LIBSVM\n", - "import StatsBase\n", - "\n", - "# set the seed of the global random number generator\n", - "# this makes the results reproducible\n", - "srand(999)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running for the first time\n", - "\n", - "If you are running this file for the first time and/or if you do not have\n", - "any trained models saved to disk, uncomment the lines below to train a model and save it to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "true" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_pretrained = false\n", - "save_trained = true" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a pre-trained model\n", - "\n", - "If you already have trained models saved, and you would like to load those. Uncomment the lines below" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# load_pretrained = true\n", - "# save_trained = false" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set your paths" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"./nusvr_svmreg.jld2\"" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "epsilonsvr_svmreg_filename = \"./epsilonsvr_svmreg.jld2\"\n", - "nusvr_svmreg_filename = \"./nusvr_svmreg.jld2\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 2: Prepare data " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Row | Crim | Zn | Indus | Chas | NOx | Rm | Age | Dis | Rad | Tax | PTRatio | Black | LStat | MedV
-       1 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.09 | 1 | 296 | 15.3 | 396.9 | 4.98 | 24.0
-       2 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.9 | 9.14 | 21.6
-       3 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7
-       4 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4
-       5 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.9 | 5.33 | 36.2
-       6 | 0.02985 | 0.0 | 2.18 | 0 | 0.458 | 6.43 | 58.7 | 6.0622 | 3 | 222 | 18.7 | 394.12 | 5.21 | 28.7
" - ], - "text/plain": [ - "6×14 DataFrames.DataFrame. Omitted printing of 5 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼───────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 0.00632 │ 18.0 │ 2.31 │ 0 │ 0.538 │ 6.575 │ 65.2 │ 4.09 │ 1 │\n", - "│ 2 │ 0.02731 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 6.421 │ 78.9 │ 4.9671 │ 2 │\n", - "│ 3 │ 0.02729 │ 0.0 │ 7.07 │ 0 │ 0.469 │ 7.185 │ 61.1 │ 4.9671 │ 2 │\n", - "│ 4 │ 0.03237 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.998 │ 45.8 │ 6.0622 │ 3 │\n", - "│ 5 │ 0.06905 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 7.147 │ 54.2 │ 6.0622 │ 3 │\n", - "│ 6 │ 0.02985 │ 0.0 │ 2.18 │ 0 │ 0.458 │ 6.43 │ 58.7 │ 6.0622 │ 3 │" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Import Boston housing data\n", - "df = CSV.read(\n", - " GZip.gzopen(joinpath(Pkg.dir(\"RDatasets\"),\"data\",\"MASS\",\"Boston.csv.gz\")),\n", - " DataFrames.DataFrame,\n", - " )\n", - "\n", - "#take a quick look at file header and few rows\n", - "DataFrames.head(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "13-element Array{Symbol,1}:\n", - " :Crim \n", - " :Zn \n", - " :Indus \n", - " :Chas \n", - " :NOx \n", - " :Rm \n", - " :Age \n", - " :Dis \n", - " :Rad \n", - " :Tax \n", - " :PTRatio\n", - " :Black \n", - " :LStat " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Remove rows with missing data\n", - "DataFrames.dropmissing!(df)\n", - "\n", - "# Shuffle rows\n", - "PredictMD.shuffle_rows!(df)\n", - "\n", - "# Define labels\n", - "categoricalfeaturenames = Symbol[]\n", - "\n", - "continuousfeaturenames = Symbol[\n", - " :Crim,\n", - " :Zn,\n", - " :Indus,\n", - " :Chas,\n", - " :NOx,\n", - " :Rm,\n", - " :Age,\n", - " :Dis,\n", - " :Rad,\n", - " :Tax,\n", - " :PTRatio,\n", - " :Black,\n", - " :LStat,\n", - " ]\n", - "featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PredictMD.ImmutableDataFrameFeatureContrasts(Symbol[:Crim, :Zn, :Indus, :Chas, :NOx, :Rm, :Age, :Dis, :Rad, :Tax, :PTRatio, :Black, :LStat], 13, Dict{Symbol,StatsModels.ContrastsMatrix}(), 13)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if load_pretrained\n", - "else\n", - " contrasts = PredictMD.contrasts(df, featurenames)\n", - "end" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Row | Crim | Zn | Indus | Chas | NOx | Rm | Age | Dis | Rad | Tax | PTRatio | Black | LStat
-       1 | 88.9762 | 0.0 | 18.1 | 0 | 0.671 | 6.968 | 91.9 | 1.4165 | 24 | 666 | 20.2 | 396.9 | 17.21
-       2 | 0.06047 | 0.0 | 2.46 | 0 | 0.488 | 6.153 | 68.8 | 3.2797 | 3 | 193 | 17.8 | 387.11 | 13.15
-       3 | 0.1712 | 0.0 | 8.56 | 0 | 0.52 | 5.836 | 91.9 | 2.211 | 5 | 384 | 20.9 | 395.67 | 18.66
-       4 | 0.54452 | 0.0 | 21.89 | 0 | 0.624 | 6.151 | 97.9 | 1.6687 | 4 | 437 | 21.2 | 396.9 | 18.46
-       5 | 0.03466 | 35.0 | 6.06 | 0 | 0.4379 | 6.031 | 23.3 | 6.6407 | 1 | 304 | 16.9 | 362.25 | 7.83
-       6 | 0.02009 | 95.0 | 2.68 | 0 | 0.4161 | 8.034 | 31.9 | 5.118 | 4 | 224 | 14.7 | 390.55 | 2.88
" - ], - "text/plain": [ - "6×13 DataFrames.DataFrame. Omitted printing of 4 columns\n", - "│ Row │ Crim │ Zn │ Indus │ Chas │ NOx │ Rm │ Age │ Dis │ Rad │\n", - "├─────┼─────────┼──────┼───────┼──────┼────────┼───────┼──────┼────────┼─────┤\n", - "│ 1 │ 88.9762 │ 0.0 │ 18.1 │ 0 │ 0.671 │ 6.968 │ 91.9 │ 1.4165 │ 24 │\n", - "│ 2 │ 0.06047 │ 0.0 │ 2.46 │ 0 │ 0.488 │ 6.153 │ 68.8 │ 3.2797 │ 3 │\n", - "│ 3 │ 0.1712 │ 0.0 │ 8.56 │ 0 │ 0.52 │ 5.836 │ 91.9 │ 2.211 │ 5 │\n", - "│ 4 │ 0.54452 │ 0.0 │ 21.89 │ 0 │ 0.624 │ 6.151 │ 97.9 │ 1.6687 │ 4 │\n", - "│ 5 │ 0.03466 │ 35.0 │ 6.06 │ 0 │ 0.4379 │ 6.031 │ 23.3 │ 6.6407 │ 1 │\n", - "│ 6 │ 0.02009 │ 95.0 │ 2.68 │ 0 │ 0.4161 │ 8.034 │ 31.9 │ 5.118 │ 4 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - }, - { - "data": { - "text/html": [ - "
[HTML table omitted; see the plain-text table below]
" - ], - "text/plain": [ - "6×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼──────┤\n", - "│ 1 │ 10.4 │\n", - "│ 2 │ 29.6 │\n", - "│ 3 │ 19.5 │\n", - "│ 4 │ 17.8 │\n", - "│ 5 │ 19.4 │\n", - "│ 6 │ 50.0 │" - ] - }, - "metadata": {}, - "output_type": "display_data", - "source": "julia" - } - ], - "source": [ - "# Define labels\n", - "labelname = :MedV\n", - "\n", - "# Put features and labels in separate dataframes\n", - "features_df = df[featurenames]\n", - "labels_df = df[[labelname]]\n", - "\n", - "# Display for exploration\n", - "display(DataFrames.head(features_df))\n", - "display(DataFrames.head(labels_df))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary Stats:\n", - "Mean: 22.532806\n", - "Minimum: 5.000000\n", - "1st Quartile: 17.025000\n", - "Median: 21.200000\n", - "3rd Quartile: 25.000000\n", - "Maximum: 50.000000\n", - "Length: 506\n", - "Type: Union{Float64, Missings.Missing}\n", - "Number Missing: 0\n", - "% Missing: 0.000000\n" - ] - } - ], - "source": [ - "# View summary statistics for label variable (mean, quartiles, etc.)\n", - "DataFrames.describe(labels_df[labelname])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Split data into training set (70%) and testing set (30%)\n", - "training_features_df,testing_features_df,traininglabels_df,testing_labels_df =\n", - " PredictMD.split_data(features_df,labels_df;training = 0.7,testing = 0.3,);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Section 3: Set up and train models " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Support vector machine (epsilon support vector regression) ################" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mStarting to train LIBSVM.jl model.\n", - "\u001b[39m\u001b[1m\u001b[36mINFO: \u001b[39m\u001b[22m\u001b[36mFinished training LIBSVM.jl model.\n", - "\u001b[39m" - ] - }, - { - "data": { - "text/plain": [ - "PredictMD.LIBSVMModel(\"SVM (epsilon-SVR)\", false, true, Float64[], Dict{Any,Any}(Pair{Any,Any}(:gamma, 0.1),Pair{Any,Any}(:svmtype, LIBSVM.EpsilonSVR),Pair{Any,Any}(:degree, 3),Pair{Any,Any}(:tolerance, 0.001),Pair{Any,Any}(:nu, 0.5),Pair{Any,Any}(:cachesize, 100.0),Pair{Any,Any}(:epsilon, 0.1),Pair{Any,Any}(:kernel, Linear),Pair{Any,Any}(:verbose, false),Pair{Any,Any}(:cost, 1.0)…), LIBSVM.SVM{Float64}(LIBSVM.EpsilonSVR, Linear::LIBSVM.Kernel.KERNEL = 0, nothing, 13, 2, Float64[], Int32[], Float64[], Int32[], LIBSVM.SupportVectors{Union{Float64, Missings.Missing},Float64}(0, Int32[], Union{Float64, Missings.Missing}[], Array{Float64}(13,0), Int32[], LIBSVM.SVMNode[]), 0.0, Array{Float64}(0,1), Float64[], Float64[], [0.0], 3, 0.1, 100.0, 0.001, 1.0, 0.5, 0.1, true, false))" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Set up epsilon-SVR model\n", - "epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression(\n", - " featurenames,\n", - " labelname;\n", - " package = :LIBSVMjl,\n", - " svmtype = LIBSVM.EpsilonSVR,\n", - " name = \"SVM (epsilon-SVR)\",\n", - " kernel = LIBSVM.Kernel.Linear,\n", - " verbose = false,\n", - " )\n", - "\n", - "if load_pretrained\n", - " 
PredictMD.load!(epsilonsvr_svmreg_filename, epsilonsvr_svmreg)\n",
-    "else\n",
-    "    # set feature contrasts\n",
-    "    PredictMD.set_feature_contrasts!(epsilonsvr_svmreg, contrasts)\n",
-    "    # Train epsilon-SVR model on training set\n",
-    "    PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,)\n",
-    "end"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    [SVG plot output omitted: "Predicted value" vs. "True value" scatter for epsilon-SVR on the training set; every predicted value is 0.0]
-   ],
-   "source": [
-    "# Plot true values versus predicted values for epsilon-SVR on training set\n",
-    "epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n",
-    "    epsilonsvr_svmreg,\n",
-    "    training_features_df,\n",
-    "    traininglabels_df,\n",
-    "    labelname,\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    [SVG plot output omitted: "Predicted value" vs. "True value" scatter for epsilon-SVR on the testing set; every predicted value is 0.0]
-   ],
-   "source": [
-    "# Plot true values versus predicted values for epsilon-SVR on testing set\n",
-    "epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n",
-    "    epsilonsvr_svmreg,\n",
-    "    testing_features_df,\n",
-    "    testing_labels_df,\n",
-    "    labelname,\n",
-    "    )"
-   ]
-  },
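Both scatter plots above show the epsilon-SVR predicting 0.0 for every observation, and the fitted `LIBSVM.SVM` summary earlier in this notebook appears to report zero support vectors, so the model has effectively learned nothing. A common remedy for SVMs trained on raw features with very different scales (here `Crim` spans 0.006 to 88.98 while `NOx` stays below 1) is to standardize each column first. A hand-rolled z-scoring sketch in plain Julia, not a PredictMD API, and untested as a fix for this particular run:

```julia
# Standardize each continuous feature column to zero mean and unit variance,
# using statistics computed on the training set only (to avoid test-set leakage).
scaled_training_features_df = deepcopy(training_features_df)
scaled_testing_features_df  = deepcopy(testing_features_df)
for col in continuousfeaturenames
    mu    = mean(training_features_df[col])
    sigma = std(training_features_df[col])
    scaled_training_features_df[col] = (training_features_df[col] .- mu) ./ sigma
    scaled_testing_features_df[col]  = (testing_features_df[col]  .- mu) ./ sigma
end

# The SVR models could then be trained on the scaled copies instead, e.g.:
# PredictMD.fit!(epsilonsvr_svmreg, scaled_training_features_df, traininglabels_df)
```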
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "[HTML table omitted; see the plain-text table below]"
-      ],
-      "text/plain": [
-       "1×2 DataFrames.DataFrame\n",
-       "│ Row │ metric                             │ SVM (epsilon-SVR) │\n",
-       "├─────┼────────────────────────────────────┼───────────────────┤\n",
-       "│ 1   │ R^2 (coefficient of determination) │ -6.30321          │"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Evaluate performance of epsilon-SVR on training set\n",
-    "PredictMD.singlelabelregressionmetrics(\n",
-    "    epsilonsvr_svmreg,\n",
-    "    training_features_df,\n",
-    "    traininglabels_df,\n",
-    "    labelname,\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "[HTML table omitted; see the plain-text table below]"
-      ],
-      "text/plain": [
-       "1×2 DataFrames.DataFrame\n",
-       "│ Row │ metric                             │ SVM (epsilon-SVR) │\n",
-       "├─────┼────────────────────────────────────┼───────────────────┤\n",
-       "│ 1   │ R^2 (coefficient of determination) │ -5.42347          │"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Evaluate performance of epsilon-SVR on testing set\n",
-    "PredictMD.singlelabelregressionmetrics(\n",
-    "    epsilonsvr_svmreg,\n",
-    "    testing_features_df,\n",
-    "    testing_labels_df,\n",
-    "    labelname,\n",
-    "    )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Support vector machine ($\\nu$-support vector regression)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    [training log and fitted PredictMD.LIBSVMModel ("SVM (nu-SVR)", LIBSVM.NuSVR, linear kernel) summary omitted]
-   ],
-   "source": [
-    "# Set up nu-SVR model\n",
-    "nusvr_svmreg = PredictMD.singlelabeldataframesvmregression(\n",
-    "    featurenames,\n",
-    "    labelname;\n",
-    "    package = :LIBSVMjl,\n",
-    "    svmtype = LIBSVM.NuSVR,\n",
-    "    name = \"SVM (nu-SVR)\",\n",
-    "    kernel = LIBSVM.Kernel.Linear,\n",
-    "    verbose = false,\n",
-    "    )\n",
-    "\n",
-    "if load_pretrained\n",
-    "    PredictMD.load!(nusvr_svmreg_filename, nusvr_svmreg)\n",
-    "else\n",
-    "    # set feature contrasts\n",
-    "    PredictMD.set_feature_contrasts!(nusvr_svmreg, contrasts)\n",
-    "    # Train nu-SVR model\n",
-    "    PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,)\n",
-    "end"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    [SVG plot output omitted: "Predicted value" vs. "True value" scatter for nu-SVR on the training set; every predicted value is 0.0; output continues below]
27.5)\\n(27.9, 27.9)\\n(28.0, 28.0)\\n(28.1, 28.1)\\n(28.2, 28.2)\\n(28.4, 28.4)\\n(28.7, 28.7)\\n(29.0, 29.0)\\n(29.1, 29.1)\\n(29.4, 29.4)\\n(29.6, 29.6)\\n(29.8, 29.8)\\n(29.9, 29.9)\\n(30.1, 30.1)\\n(30.3, 30.3)\\n(30.5, 30.5)\\n(31.0, 31.0)\\n(31.1, 31.1)\\n(31.5, 31.5)\\n(31.6, 31.6)\\n(31.7, 31.7)\\n(32.0, 32.0)\\n(32.2, 32.2)\\n(32.4, 32.4)\\n(32.5, 32.5)\\n(32.7, 32.7)\\n(32.9, 32.9)\\n(33.0, 33.0)\\n(33.1, 33.1)\\n(33.2, 33.2)\\n(33.4, 33.4)\\n(33.8, 33.8)\\n(34.6, 34.6)\\n(34.7, 34.7)\\n(34.9, 34.9)\\n(35.1, 35.1)\\n(35.2, 35.2)\\n(36.0, 36.0)\\n(36.1, 36.1)\\n(36.2, 36.2)\\n(36.4, 36.4)\\n(36.5, 36.5)\\n(37.2, 37.2)\\n(37.3, 37.3)\\n(38.7, 38.7)\\n(39.8, 39.8)\\n(41.7, 41.7)\\n(42.3, 42.3)\\n(42.8, 42.8)\\n(43.1, 43.1)\\n(43.8, 43.8)\\n(44.8, 44.8)\\n(45.4, 45.4)\\n(46.0, 46.0)\\n(48.3, 48.3)\\n(48.5, 48.5)\\n(48.8, 48.8)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for nu-SVR on training set\n", - "nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " nusvr_svmreg,\n", - " training_features_df,\n", - " traininglabels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - 
"\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "TikzPictures.TikzPicture(\"\\\\begin{axis}[ylabel = {Predicted value}, xlabel = {True value}]\\\\addplot+ [only marks = {true}, black,fill=black]coordinates {\\n(10.4, 0.0)\\n(50.0, 0.0)\\n(50.0, 0.0)\\n(15.2, 0.0)\\n(26.2, 0.0)\\n(5.0, 0.0)\\n(7.0, 0.0)\\n(27.1, 0.0)\\n(35.4, 0.0)\\n(17.7, 0.0)\\n(8.8, 0.0)\\n(9.5, 0.0)\\n(31.2, 0.0)\\n(50.0, 0.0)\\n(8.4, 0.0)\\n(14.6, 0.0)\\n(19.7, 0.0)\\n(23.8, 0.0)\\n(20.0, 0.0)\\n(21.2, 0.0)\\n(30.7, 0.0)\\n(15.6, 0.0)\\n(50.0, 0.0)\\n(17.2, 0.0)\\n(19.6, 0.0)\\n(17.2, 0.0)\\n(18.1, 0.0)\\n(25.0, 0.0)\\n(22.2, 0.0)\\n(30.8, 0.0)\\n(37.9, 0.0)\\n(22.8, 0.0)\\n(20.4, 0.0)\\n(15.2, 0.0)\\n(41.3, 0.0)\\n(12.7, 0.0)\\n(20.1, 0.0)\\n(36.2, 0.0)\\n(21.9, 0.0)\\n(16.6, 0.0)\\n(24.4, 0.0)\\n(8.3, 0.0)\\n(35.4, 0.0)\\n(18.0, 0.0)\\n(23.2, 0.0)\\n(24.7, 0.0)\\n(22.9, 0.0)\\n(28.5, 0.0)\\n(8.5, 0.0)\\n(25.0, 0.0)\\n(24.6, 0.0)\\n(13.1, 0.0)\\n(14.3, 0.0)\\n(7.2, 0.0)\\n(28.6, 0.0)\\n(23.4, 0.0)\\n(14.9, 0.0)\\n(10.9, 0.0)\\n(11.0, 0.0)\\n(8.5, 0.0)\\n(28.7, 0.0)\\n(20.0, 0.0)\\n(23.9, 0.0)\\n(15.6, 0.0)\\n(43.5, 0.0)\\n(44.0, 0.0)\\n(22.1, 0.0)\\n(20.5, 0.0)\\n(24.8, 0.0)\\n(22.3, 0.0)\\n(15.0, 0.0)\\n(13.3, 0.0)\\n(11.9, 0.0)\\n(22.0, 0.0)\\n(18.8, 0.0)\\n(18.7, 0.0)\\n(23.1, 0.0)\\n(20.0, 0.0)\\n(21.7, 0.0)\\n(37.6, 0.0)\\n(37.0, 0.0)\\n(24.8, 0.0)\\n(26.7, 0.0)\\n(19.3, 0.0)\\n(16.1, 0.0)\\n(20.3, 0.0)\\n(21.0, 0.0)\\n(10.8, 0.0)\\n(50.0, 0.0)\\n(20.6, 0.0)\\n(22.2, 0.0)\\n(14.4, 0.0)\\n(23.6, 0.0)\\n(18.9, 0.0)\\n(22.0, 0.0)\\n(26.6, 0.0)\\n(20.6, 0.0)\\n(23.9, 0.0)\\n(24.6, 0.0)\\n(6.3, 0.0)\\n(21.7, 0.0)\\n(11.7, 0.0)\\n(33.4, 0.0)\\n(24.3, 0.0)\\n(13.1, 0.0)\\n(26.5, 0.0)\\n(20.9, 0.0)\\n(50.0, 0.0)\\n(23.9, 0.0)\\n(8.7, 0.0)\\n(13.2, 0.0)\\n(18.3, 0.0)\\n(21.8, 0.0)\\n(18.6, 0.0)\\n(19.5, 0.0)\\n(24.0, 0.0)\\n(22.5, 0.0)\\n(26.4, 0.0)\\n(16.8, 0.0)\\n(33.2, 0.0)\\n(21.9, 0.0)\\n(23.9, 0.0)\\n(46.7, 0.0)\\n(17.1, 0.0)\\n(16.5, 0.0)\\n(25.0, 0.0)\\n(20.7, 0.0)\\n(19.8, 0.0)\\n(22.8, 0.0)\\n(18.4, 0.0)\\n(18.5, 0.0)\\n(31.5, 0.0)\\n(13.3, 0.0)\\n(19.9, 0.0)\\n(22.2, 0.0)\\n(16.1, 0.0)\\n(17.8, 0.0)\\n(27.1, 0.0)\\n(23.1, 0.0)\\n(19.1, 0.0)\\n(33.3, 0.0)\\n(11.3, 0.0)\\n(28.7, 0.0)\\n(19.4, 0.0)\\n(19.9, 0.0)\\n(13.8, 0.0)\\n(19.4, 0.0)\\n(20.3, 0.0)\\n(13.8, 0.0)\\n(16.1, 0.0)\\n(18.9, 0.0)\\n(19.0, 0.0)\\n};\\n\\\\addplot+ [mark = {none}, red]coordinates {\\n(5.0, 5.0)\\n(6.3, 6.3)\\n(7.0, 7.0)\\n(7.2, 7.2)\\n(8.3, 8.3)\\n(8.4, 8.4)\\n(8.5, 8.5)\\n(8.7, 8.7)\\n(8.8, 8.8)\\n(9.5, 9.5)\\n(10.4, 10.4)\\n(10.8, 10.8)\\n(10.9, 10.9)\\n(11.0, 11.0)\\n(11.3, 11.3)\\n(11.7, 11.7)\\n(11.9, 11.9)\\n(12.7, 12.7)\\n(13.1, 13.1)\\n(13.2, 13.2)\\n(13.3, 13.3)\\n(13.8, 13.8)\\n(14.3, 14.3)\\n(14.4, 14.4)\\n(14.6, 14.6)\\n(14.9, 14.9)\\n(15.0, 15.0)\\n(15.2, 15.2)\\n(15.6, 15.6)\\n(16.1, 16.1)\\n(16.5, 16.5)\\n(16.6, 16.6)\\n(16.8, 16.8)\\n(17.1, 17.1)\\n(17.2, 17.2)\\n(17.7, 
17.7)\\n(17.8, 17.8)\\n(18.0, 18.0)\\n(18.1, 18.1)\\n(18.3, 18.3)\\n(18.4, 18.4)\\n(18.5, 18.5)\\n(18.6, 18.6)\\n(18.7, 18.7)\\n(18.8, 18.8)\\n(18.9, 18.9)\\n(19.0, 19.0)\\n(19.1, 19.1)\\n(19.3, 19.3)\\n(19.4, 19.4)\\n(19.5, 19.5)\\n(19.6, 19.6)\\n(19.7, 19.7)\\n(19.8, 19.8)\\n(19.9, 19.9)\\n(20.0, 20.0)\\n(20.1, 20.1)\\n(20.3, 20.3)\\n(20.4, 20.4)\\n(20.5, 20.5)\\n(20.6, 20.6)\\n(20.7, 20.7)\\n(20.9, 20.9)\\n(21.0, 21.0)\\n(21.2, 21.2)\\n(21.7, 21.7)\\n(21.8, 21.8)\\n(21.9, 21.9)\\n(22.0, 22.0)\\n(22.1, 22.1)\\n(22.2, 22.2)\\n(22.3, 22.3)\\n(22.5, 22.5)\\n(22.8, 22.8)\\n(22.9, 22.9)\\n(23.1, 23.1)\\n(23.2, 23.2)\\n(23.4, 23.4)\\n(23.6, 23.6)\\n(23.8, 23.8)\\n(23.9, 23.9)\\n(24.0, 24.0)\\n(24.3, 24.3)\\n(24.4, 24.4)\\n(24.6, 24.6)\\n(24.7, 24.7)\\n(24.8, 24.8)\\n(25.0, 25.0)\\n(26.2, 26.2)\\n(26.4, 26.4)\\n(26.5, 26.5)\\n(26.6, 26.6)\\n(26.7, 26.7)\\n(27.1, 27.1)\\n(28.5, 28.5)\\n(28.6, 28.6)\\n(28.7, 28.7)\\n(30.7, 30.7)\\n(30.8, 30.8)\\n(31.2, 31.2)\\n(31.5, 31.5)\\n(33.2, 33.2)\\n(33.3, 33.3)\\n(33.4, 33.4)\\n(35.4, 35.4)\\n(36.2, 36.2)\\n(37.0, 37.0)\\n(37.6, 37.6)\\n(37.9, 37.9)\\n(41.3, 41.3)\\n(43.5, 43.5)\\n(44.0, 44.0)\\n(46.7, 46.7)\\n(50.0, 50.0)\\n};\\n\\\\end{axis}\\n\", \"\", \"\\\\usepackage{pgfplots}\\n\\\\pgfplotsset{compat=newest}\\n\\\\pgfplotsset{every axis legend/.append style={%\\ncells={anchor=west}}\\n}\\n\\\\usepgfplotslibrary{polar}\\n\\\\usetikzlibrary{arrows}\\n\\\\tikzset{>=stealth'}\\n\", true, true)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Plot true values versus predicted values for nu-SVR on testing set\n", - "nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(\n", - " nusvr_svmreg,\n", - " testing_features_df,\n", - " testing_labels_df,\n", - " labelname,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
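- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "*Editor's note (added):* the metric reported below is the coefficient of determination, $R^2 = 1 - \\mathrm{SS}_{\\mathrm{res}} / \\mathrm{SS}_{\\mathrm{tot}}$. Since this model predicts 0.0 for every sample, $\\mathrm{SS}_{\\mathrm{res}}$ far exceeds $\\mathrm{SS}_{\\mathrm{tot}}$, which is why the $R^2$ values are large and negative (worse than always predicting the mean of the labels)."
- ]
- },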
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [ (HTML rendering omitted; it mirrors the text/plain table below) ],
- "text/plain": [
- "1×2 DataFrames.DataFrame\n",
- "│ Row │ metric                             │ SVM (nu-SVR) │\n",
- "├─────┼────────────────────────────────────┼──────────────┤\n",
- "│ 1   │ R^2 (coefficient of determination) │ -6.30321     │"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Evaluate performance of nu-SVR on training set\n",
- "PredictMD.singlelabelregressionmetrics(\n",
- "    nusvr_svmreg,\n",
- "    training_features_df,\n",
- "    traininglabels_df,\n",
- "    labelname,\n",
- "    )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [ (HTML rendering omitted; it mirrors the text/plain table below) ],
- "text/plain": [
- "1×2 DataFrames.DataFrame\n",
- "│ Row │ metric                             │ SVM (nu-SVR) │\n",
- "├─────┼────────────────────────────────────┼──────────────┤\n",
- "│ 1   │ R^2 (coefficient of determination) │ -5.42347     │"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Evaluate performance of nu-SVR on testing set\n",
- "PredictMD.singlelabelregressionmetrics(\n",
- "    nusvr_svmreg,\n",
- "    testing_features_df,\n",
- "    testing_labels_df,\n",
- "    labelname,\n",
- "    )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Section 4: Save trained models to file (if desired)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "INFO: Saved model to file ./epsilonsvr_svmreg.jld2\n",
- "INFO: Saved model to file ./nusvr_svmreg.jld2\n"
- ]
- }
- ],
- "source": [
- "if save_trained\n",
- "    PredictMD.save(epsilonsvr_svmreg_filename, epsilonsvr_svmreg)\n",
- "    PredictMD.save(nusvr_svmreg_filename, nusvr_svmreg)\n",
- "end"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Appendix A: Directly access the output of regression models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [ (HTML rendering of the 152-row predictions table omitted; every value is 0.0, duplicating the text/plain DataFrame below)
" - ], - "text/plain": [ - "152×1 DataFrames.DataFrame\n", - "│ Row │ MedV │\n", - "├─────┼──────┤\n", - "│ 1 │ 0.0 │\n", - "│ 2 │ 0.0 │\n", - "│ 3 │ 0.0 │\n", - "│ 4 │ 0.0 │\n", - "│ 5 │ 0.0 │\n", - "│ 6 │ 0.0 │\n", - "│ 7 │ 0.0 │\n", - "│ 8 │ 0.0 │\n", - "│ 9 │ 0.0 │\n", - "│ 10 │ 0.0 │\n", - "│ 11 │ 0.0 │\n", - "⋮\n", - "│ 141 │ 0.0 │\n", - "│ 142 │ 0.0 │\n", - "│ 143 │ 0.0 │\n", - "│ 144 │ 0.0 │\n", - "│ 145 │ 0.0 │\n", - "│ 146 │ 0.0 │\n", - "│ 147 │ 0.0 │\n", - "│ 148 │ 0.0 │\n", - "│ 149 │ 0.0 │\n", - "│ 150 │ 0.0 │\n", - "│ 151 │ 0.0 │\n", - "│ 152 │ 0.0 │" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can use the PredictMD.predict() function to get the real-valued predictions\n", - "# output by each of regression models.\n", - "\n", - "# Get real-valued predictions from each model for training set\n", - "PredictMD.predict(epsilonsvr_svmreg,training_features_df)\n", - "PredictMD.predict(nusvr_svmreg,training_features_df)\n", - "\n", - "# Get real-valued predictions from each model for testing set\n", - "PredictMD.predict(epsilonsvr_svmreg,testing_features_df)\n", - "PredictMD.predict(nusvr_svmreg,testing_features_df)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Julia 0.6.2", - "language": "julia", - "name": "julia-0.6" - }, - "language_info": { - "file_extension": ".jl", - "mimetype": "application/julia", - "name": "julia", - "version": "0.6.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples_old/boston_housing/boston_housing_svm.jl b/examples_old/boston_housing/boston_housing_svm.jl deleted file mode 100644 index fcf9b1633..000000000 --- a/examples_old/boston_housing/boston_housing_svm.jl +++ /dev/null @@ -1,200 +0,0 @@ - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import LIBSVM -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -load_pretrained = false -save_trained = true - -# load_pretrained = true -# save_trained = false - -epsilonsvr_svmreg_filename = "./epsilonsvr_svmreg.jld2" -nusvr_svmreg_filename = "./nusvr_svmreg.jld2" - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -#take a quick look at file header and few rows -DataFrames.head(df) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] - -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if load_pretrained -else - contrasts = PredictMD.contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Display for exploration -display(DataFrames.head(features_df)) -display(DataFrames.head(labels_df)) - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname]) - -# Split data into training set (70%) and testing set (30%) -training_features_df,testing_features_df,traininglabels_df,testing_labels_df = - PredictMD.split_data(features_df,labels_df,0.7); - -# Set up epsilon-SVR model -epsilonsvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.EpsilonSVR, - name = "SVM (epsilon-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) - -if load_pretrained - PredictMD.load!(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(epsilonsvr_svmreg , feature_contrasts) - # Train epsilon-SVR model on training set - PredictMD.fit!(epsilonsvr_svmreg,training_features_df,traininglabels_df,) -end - -# Plot true values versus predicted values for epsilon-SVR on training set -epsilonsvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Plot true values versus predicted values for epsilon-SVR on testing set -epsilonsvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -# Evaluate performance of epsilon-SVR on training set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of epsilon-SVR on testing set -PredictMD.singlelabelregressionmetrics( - epsilonsvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -# Set up nu-SVR model -nusvr_svmreg = PredictMD.singlelabeldataframesvmregression( - featurenames, - labelname; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVR, - name = "SVM (nu-SVR)", - kernel = LIBSVM.Kernel.Linear, - verbose = false, - ) - -if load_pretrained - PredictMD.load!(nusvr_svmreg_filename, nusvr_svmreg) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(nusvr_svmreg , feature_contrasts) - # Train nu-SVR model - PredictMD.fit!(nusvr_svmreg,training_features_df,traininglabels_df,) -end - -# Plot true values versus predicted values for nu-SVR on training set -nusvr_svmreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Plot true values versus predicted values for nu-SVR on testing set -nusvr_svmreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -# Evaluate performance of nu-SVR on training set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - training_features_df, - traininglabels_df, - labelname, - ) - -# Evaluate performance of nu-SVR on testing set -PredictMD.singlelabelregressionmetrics( - nusvr_svmreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -if save_trained - PredictMD.save(epsilonsvr_svmreg_filename, epsilonsvr_svmreg) - PredictMD.save(nusvr_svmreg_filename, nusvr_svmreg) -end - -# We can use the PredictMD.predict() function to get the real-valued predictions -# output by each of regression models. 
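-
-# Added sketch (illustrative only; it assumes the value returned by
-# PredictMD.predict() behaves like a numeric vector aligned with the rows of
-# the features DataFrame): the predictions can be compared against the true
-# labels by hand, e.g. to compute a mean absolute error:
-# yhat = PredictMD.predict(nusvr_svmreg, testing_features_df)
-# mae  = mean(abs.(testing_labels_df[labelname] .- yhat))  # mean absolute error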
- -# Get real-valued predictions from each model for training set -PredictMD.predict(epsilonsvr_svmreg,training_features_df) -PredictMD.predict(nusvr_svmreg,training_features_df) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(epsilonsvr_svmreg,testing_features_df) -PredictMD.predict(nusvr_svmreg,testing_features_df) diff --git a/examples_old/breast_cancer_biopsy.jl b/examples_old/breast_cancer_biopsy.jl deleted file mode 100644 index 972324a1e..000000000 --- a/examples_old/breast_cancer_biopsy.jl +++ /dev/null @@ -1,928 +0,0 @@ -############################################################################## -############################################################################## -### INSTRUCTIONS FOR USING THIS FILE: ######################################## -############################################################################## -############################################################################## -## -## If you are running this file for the first time and/or if you do not have -## any trained models saved to disk, take the following steps: -## 1. Uncomment lines 27 and 28 -## 2. Comment out lines 30 and 31 -## 3. Set the variables on lines 33 through 38 to the filenames where you -## would like to save your models after training them. -## 4. Run the entire file. This will train the models, compare their -## performance, print metrics to the console, generate plots, and save -## the trained models to disk. -## -## If you already have trained models saved, and you would like to load those -## models from disk, take the following steps: -## 1. Comment out lines 27 and 28 -## 2. Uncomment lines 30 and 31 -## 3. Make sure the variables on lines 33 through 38 are set to the -## filenames where your trained models are currently saved. -## 4. Run the entire file. This will load the trained models from disk, -## compare their performance, print metrics to the console, and generate -## plots. 
- -# ENV["LOADTRAINEDMODELSFROMFILE"] = "false" -# ENV["SAVETRAINEDMODELSTOFILE"] = "true" - -# ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -# ENV["SAVETRAINEDMODELSTOFILE"] = "false" - -logisticclassifier_filename = "/Users/dilum/Desktop/logisticclassifier.jld2" -probitclassifier_filename = "/Users/dilum/Desktop/probitclassifier.jld2" -rfclassifier_filename = "/Users/dilum/Desktop/rfclassifier.jld2" -csvc_svmclassifier_filename = "/Users/dilum/Desktop/csvc_svmclassifier.jld2" -nusvc_svmclassifier_filename = "/Users/dilum/Desktop/nusvc_svmclassifier.jld2" -knetmlp_filename = "/Users/dilum/Desktop/knetmlpclassifier.jld2" - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import DataFrames -import Knet -import LIBSVM -import RDatasets -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import breast cancer biopsy data -df = RDatasets.dataset("MASS", "biopsy") - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define features -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :V1, - :V2, - :V3, - :V4, - :V5, - :V6, - :V7, - :V8, - :V9, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - contrasts = PredictMD.contrasts(df, featurenames) -end - -# Define labels -labelname = :Class -negativeclass = "benign" -positiveclass = "malignant" -labellevels = [negativeclass, positiveclass] - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Split data into training set (70%) and testing set (30%) -training_features_df,testing_features_df,traininglabels_df,testing_labels_df = - PredictMD.split_data(features_df,labels_df,0.7) - -############################################################################## -############################################################################## -### Section 3: Apply the SMOTE algorithm to the training set ################# -############################################################################## -############################################################################## - -# Examine prevalence of each class in training set -DataFrames.describe(traininglabels_df[labelname]) -StatsBase.countmap(traininglabels_df[labelname]) - -# We see that malignant is minority class and benign is majority class. -# The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending -# on random seed). We would like that ratio to be 1:1. We will use SMOTE -# to generate synthetic minority class samples. We will also undersample the -# minority class. 
The result will be a balanced training set. -majorityclass = "benign" -minorityclass = "malignant" - -smotedtraining_features_df, smotedtraininglabels_df = PredictMD.smote( - training_features_df, - traininglabels_df, - featurenames, - labelname; - majorityclass = majorityclass, - minorityclass = minorityclass, - pct_over = 100, # how much to oversample the minority class - minority_to_majority_ratio = 1.0, # desired minority:majority ratio - k = 5, - ) - -# Examine prevalence of each class in smoted training set -DataFrames.describe(smotedtraininglabels_df[labelname]) -StatsBase.countmap(smotedtraininglabels_df[labelname]) - -# Now we have a ratio of malignant:benign that is 1:1. - -############################################################################## -############################################################################## -### Section 4: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Logistic "regression" classifier ########################################## -############################################################################## - -# Set up logistic classifier model -logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( - featurenames, - labelname, - labellevels; - package = :GLMjl, - intercept = true, # optional, defaults to true - name = "Logistic regression", # optional - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(logisticclassifier_filename, logisticclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(logisticclassifier , feature_contrasts) - # Train logistic classifier model on smoted training set - PredictMD.fit!( - logisticclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# View coefficients, p values, etc. 
for underlying logistic regression -PredictMD.get_underlying(logisticclassifier) - -# Plot classifier histogram for logistic classifier on smoted training set -logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(logistic_hist_training) - -# Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(logistic_hist_testing) - -# Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Probit "regression" classifier ############################################ -############################################################################## - -# Set up probit classifier model -probitclassifier = PredictMD.singlelabelbinaryclassdataframeprobitclassifier( - featurenames, - labelname, - labellevels; - package = :GLMjl, - intercept = true, # optional, defaults to true - name = "Probit regression", # optional - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(probitclassifier_filename, probitclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(probitclassifier , feature_contrasts) - # Train probit classifier model on smoted training set - PredictMD.fit!( - probitclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# View coefficients, p values, etc. 
for underlying probit regression -PredictMD.get_underlying(probitclassifier) - -# Plot classifier histogram for probit classifier on smoted training set -probitclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - probitclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(probitclassifier_hist_training) - -# Plot classifier histogram for probit classifier on testing set -probitclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - probitclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(probitclassifier_hist_testing) - -# Evaluate performance of probit classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - probitclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of probit classifier on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - probitclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Random forest classifier ################################################## -############################################################################## - -# Set up random forest classifier model -rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( - featurenames, - labelname, - labellevels; - nsubfeatures = 4, # number of subfeatures; defaults to 2 - ntrees = 200, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest" # optional - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(rfclassifier_filename, rfclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(rfclassifier , feature_contrasts) - # Train random forest classifier model on smoted training set - PredictMD.fit!( - rfclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# Plot classifier histogram for random forest classifier on smoted training set -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(rfclassifier_hist_training) - -# Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(rfclassifier_hist_testing) - -# Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Support vector machine (C support vector classifier) ###################### -############################################################################## - -# Set up C-SVC model -csvc_svmclassifier = 
PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.SVC, - name = "SVM (C-SVC)", - verbose = false, - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(csvc_svmclassifier_filename, csvc_svmclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(csvc_svmclassifier , feature_contrasts) - # Train C-SVC model on smoted training set - PredictMD.fit!( - csvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# Plot classifier histogram for C-SVC on smoted training set -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(csvc_svmclassifier_hist_training) - -# Plot classifier histogram for C-SVC on testing set -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(csvc_svmclassifier_hist_testing) - -# Evaluate performance of C-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of C-SVC on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Support vector machine (nu support vector classifier) ##################### -############################################################################## - -# Set up nu-SVC model -nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVC, - name = "SVM (nu-SVC)", - verbose = false, - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(nusvc_svmclassifier_filename, nusvc_svmclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(nusvc_svmclassifier , feature_contrasts) - # Train nu-SVC model on smoted training set - PredictMD.fit!( - nusvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# Plot classifier histogram for nu-SVC on smoted training set -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(nusvc_svmclassifier_hist_training) - -# Plot classifier histogram for nu-SVC on testing set -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(nusvc_svmclassifier_hist_testing) - -# Evaluate performance of nu-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of SVM on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - 
labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Multilayer perceptron (i.e. fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - training::Bool = false, - ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases - unnormalizedlogprobs = x3 - if training - return unnormalizedlogprobs - else - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - end -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - # No need to initialize weights since we are going to load them from file - knetmlp_modelweights = Any[] -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # first hidden layer (64 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,64,contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,64,1) # biases - ), - # - # second hidden layer (32 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,32,64) # weights - ), - Cfloat.( - zeros(Cfloat,32,1) # biases - ), - # - # output layer (number of neurons == number of classes): - Cfloat.( - 0.1f0*randn(Cfloat,2,32) # weights - ), - Cfloat.( - zeros(Cfloat,2,1) # biases - ), - ] -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict(modelweights, x; training = true), - ytrue, - 1, # d = 1 means that instances are in columns - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -# Define loss hyperparameters -knetmlp_losshyperparameters = Dict() -knetmlp_losshyperparameters[:L1] = Cfloat(0.0) -knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - -# Select optimization algorithm -knetmlp_optimizationalgorithm = :Momentum - -# Set optimization hyperparameters -knetmlp_optimizerhyperparameters = Dict() - -# Set the minibatch size -knetmlp_minibatchsize = 48 - -# Set the max number of epochs. After training, look at the learning curve. If -# it looks like the model has not yet converged, raise maxepochs. If it looks -# like the loss has hit a plateau and you are worried about overfitting, lower -# maxepochs. 
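-# Added sketch (hypothetical follow-up; not executed by this script): if the
-# learning curves plotted further below were still decreasing at epoch 500,
-# one would raise the cap and refit, e.g.
-# knetmlp_maxepochs = 1000
-# PredictMD.fit!(knetmlpclassifier, smotedtraining_features_df, smotedtraininglabels_df)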
-knetmlp_maxepochs = 500 - -# Set up multilayer perceptron model -knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( - featurenames, - labelname, - labellevels; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - printlosseverynepochs = 100, # if 0, will not print at all - maxepochs = knetmlp_maxepochs, - ) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - PredictMD.load!(knetmlp_filename, knetmlpclassifier) -else - # set feature contrasts - PredictMD.set_feature_contrasts!(knetmlpclassifier , feature_contrasts) - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpclassifier, - :lossvsepoch; - ) -PredictMD.open(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpclassifier, - :lossvsepoch; - startat = 10, - endat = :end, - ) -PredictMD.open(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpclassifier, - :lossvsiteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpclassifier, - :lossvsiteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - labellevels, - ) -PredictMD.open(knetmlpclassifier_hist_training) - -# Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open(knetmlpclassifier_hist_testing) - -# Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - smotedtraining_features_df, - smotedtraininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -############################################################################## -## Section 5: Compare performance of all models ############################## -############################################################################## 
-############################################################################## - -# Compare performance of all models on smoted training set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - training_features_df, - traininglabels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - training_features_df, - traininglabels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - training_features_df, - traininglabels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - training_features_df, - traininglabels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Plot receiver operating characteristic curves for all models on testing set. -rocplottesting = PredictMD.plotroccurves( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open(rocplottesting) - -# Plot precision-recall curves for all models on testing set. 
-prplottesting = PredictMD.plotprcurves( - [ - logisticclassifier, - probitclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ], - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open(prplottesting) - -############################################################################## -############################################################################## -### Section 6: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save(logisticclassifier_filename, logisticclassifier) - PredictMD.save(probitclassifier_filename, probitclassifier) - PredictMD.save(rfclassifier_filename, rfclassifier) - PredictMD.save(csvc_svmclassifier_filename, csvc_svmclassifier) - PredictMD.save(nusvc_svmclassifier_filename, nusvc_svmclassifier) - PredictMD.save(knetmlp_filename, knetmlpclassifier) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of classification models ########### -############################################################################## -############################################################################## - -# We can use the PredictMD.predict_proba() function to get the probabilities output -# by each of the classification models. - -# Get probabilities from each model for smoted training set -PredictMD.predict_proba(logisticclassifier,smotedtraining_features_df,) -PredictMD.predict_proba(probitclassifier,smotedtraining_features_df,) -PredictMD.predict_proba(rfclassifier,smotedtraining_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,smotedtraining_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,smotedtraining_features_df,) -PredictMD.predict_proba(knetmlpclassifier,smotedtraining_features_df,) - -# Get probabilities from each model for testing set -PredictMD.predict_proba(logisticclassifier,testing_features_df,) -PredictMD.predict_proba(probitclassifier,testing_features_df,) -PredictMD.predict_proba(rfclassifier,testing_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) - -# If we want to get predicted classes instead of probabilities, we can use the -# PredictMD.predict() function to get the class predictions output by each of the -# classification models. For each sample, PredictMD.predict() will select the class -# with the highest probability. In the case of binary classification, this is -# equivalent to using a threshold of 0.5. 
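-
-# Added sketch (threshold_class is illustrative, not a PredictMD function):
-# a custom probability cutoff, instead of the implicit 0.5, can be applied to
-# the positive-class probabilities directly:
-threshold_class(p; cutoff = 0.70) = p >= cutoff ? positiveclass : negativeclass
-threshold_class(0.82) # returns "malignant" under the 0.70 cutoff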
- -# Get class predictions from each model for smoted training set -PredictMD.predict(logisticclassifier,smotedtraining_features_df,) -PredictMD.predict(probitclassifier,smotedtraining_features_df,) -PredictMD.predict(rfclassifier,smotedtraining_features_df,) -PredictMD.predict(csvc_svmclassifier,smotedtraining_features_df,) -PredictMD.predict(nusvc_svmclassifier,smotedtraining_features_df,) -PredictMD.predict(knetmlpclassifier,smotedtraining_features_df,) - -# Get class predictions from each model for testing set -PredictMD.predict(logisticclassifier,testing_features_df,) -PredictMD.predict(probitclassifier,testing_features_df,) -PredictMD.predict(rfclassifier,testing_features_df,) -PredictMD.predict(csvc_svmclassifier,testing_features_df,) -PredictMD.predict(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict(knetmlpclassifier,testing_features_df,) diff --git a/test/runtests.jl b/test/runtests.jl index 800190139..5411e9236 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,13 +26,14 @@ Base.Test.@testset "PredictMD test suite" begin include("cpu/unit/utils/test_fix_vector_type.jl") end end - Base.Test.@testset "Functional tests (CPU)" begin - info("INFO Running functional tests (CPU)") - Base.Test.@testset "Boston housing regression" begin - include("cpu/functional/bostonhousing/setup_bostonhousing.jl") - end - Base.Test.@testset "Breast cancer biopsy classification" begin - include("cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl") - end - end + include("../docs/test_examples.jl") + # Base.Test.@testset "Functional tests (CPU)" begin + # info("INFO Running functional tests (CPU)") + # Base.Test.@testset "Boston housing regression" begin + # include("cpu/functional/bostonhousing/setup_bostonhousing.jl") + # end + # Base.Test.@testset "Breast cancer biopsy classification" begin + # include("cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl") + # end + # end end From a8038e54e18abb88a33a68b79bec01490b7ba77d Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 17:24:18 -0400 Subject: [PATCH 24/62] Progress commit --- docs/test_examples.jl | 11 ----------- test/runtests.jl | 26 ++++++++++++++++---------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/docs/test_examples.jl b/docs/test_examples.jl index f9d74cfac..e69de29bb 100644 --- a/docs/test_examples.jl +++ b/docs/test_examples.jl @@ -1,11 +0,0 @@ -import Base.Test - -Base.Test.@testset "Test examples (CPU)" begin - info("INFO testing examples (CPU)") - Base.Test.@testset "Boston housing regression" begin - # include("") - end - Base.Test.@testset "Breast cancer biopsy classification" begin - # include("") - end -end diff --git a/test/runtests.jl b/test/runtests.jl index 5411e9236..0f329adcc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,28 +12,34 @@ println(string("PredictMD Version ", PredictMD.VERSION)) ENV["PREDICTMD_RUNTESTS"] = "true" Base.Test.@testset "PredictMD test suite" begin + # Base.Test.@testset "Unit tests (CPU)" begin info("INFO Running unit tests (CPU)") + # Base.Test.@testset "base" begin include("cpu/unit/base/test_version.jl") end + # Base.Test.@testset "metrics" begin include("cpu/unit/metrics/test_coefficientofdetermination.jl") include("cpu/unit/metrics/test_cohenkappa.jl") end + # Base.Test.@testset "utils" begin include("cpu/unit/utils/test_fix_dict_type.jl") include("cpu/unit/utils/test_fix_vector_type.jl") end end - include("../docs/test_examples.jl") - # Base.Test.@testset "Functional tests (CPU)" begin - # info("INFO 
Running functional tests (CPU)") - # Base.Test.@testset "Boston housing regression" begin - # include("cpu/functional/bostonhousing/setup_bostonhousing.jl") - # end - # Base.Test.@testset "Breast cancer biopsy classification" begin - # include("cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl") - # end - # end + # + Base.Test.@testset "Test examples (CPU)" begin + info("INFO testing examples (CPU)") + # + Base.Test.@testset "Boston housing regression" begin + include("") + end + # + Base.Test.@testset "Breast cancer biopsy classification" begin + include("") + end + end end From a9bf4d7691f8ab05756bffef0706df1744e74f8a Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 17:24:27 -0400 Subject: [PATCH 25/62] Progress commit --- docs/test_examples.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 docs/test_examples.jl diff --git a/docs/test_examples.jl b/docs/test_examples.jl deleted file mode 100644 index e69de29bb..000000000 From dcda906774d9e3cef884fe266e9e4941caa14dd9 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 17:51:28 -0400 Subject: [PATCH 26/62] Progress commit --- docs/make_docs.jl | 40 ++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 11 ++--------- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 25ee0804a..f20aeeb73 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -2,6 +2,46 @@ import Documenter import Literate import PredictMD +examples_input_directory = joinpath( + @__DIR__, + "..", + "examples", + ) + +examples_output_directory = joinpath( + @__DIR__, + "", + "", + "", + "", + ) + +Literate.markdown( + , + examples_output_directory, + ) +Literate.notebook( + , + examples_output_directory, + ) +Literate.script( + , + examples_output_directory, + ) + +Literate.markdown( + , + examples_output_directory, + ) +Literate.notebook( + , + examples_output_directory, + ) +Literate.script( + , + examples_output_directory, + ) + Documenter.makedocs( modules = [PredictMD], sitename = "PredictMD.jl", diff --git a/test/runtests.jl b/test/runtests.jl index 0f329adcc..534c17b64 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,34 +12,27 @@ println(string("PredictMD Version ", PredictMD.VERSION)) ENV["PREDICTMD_RUNTESTS"] = "true" Base.Test.@testset "PredictMD test suite" begin - # Base.Test.@testset "Unit tests (CPU)" begin info("INFO Running unit tests (CPU)") - # Base.Test.@testset "base" begin include("cpu/unit/base/test_version.jl") end - # Base.Test.@testset "metrics" begin include("cpu/unit/metrics/test_coefficientofdetermination.jl") include("cpu/unit/metrics/test_cohenkappa.jl") end - # Base.Test.@testset "utils" begin include("cpu/unit/utils/test_fix_dict_type.jl") include("cpu/unit/utils/test_fix_vector_type.jl") end end - # Base.Test.@testset "Test examples (CPU)" begin info("INFO testing examples (CPU)") - # Base.Test.@testset "Boston housing regression" begin - include("") + # include("") end - # Base.Test.@testset "Breast cancer biopsy classification" begin - include("") + # include("") end end end From ffab9e49012988fa31479fff33e4e46c5ea32e55 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 18:10:19 -0400 Subject: [PATCH 27/62] Progress commit --- examples/boston_housing.jl | 941 +++++++++ examples/breast_cancer_biopsy.jl | 1709 +++++++++++++++++ .../bostonhousing/run_bostonhousing.jl | 458 ----- .../bostonhousing/setup_bostonhousing.jl | 23 - .../run_breastcancerbiopsy.jl | 839 -------- 
.../setup_breastcancerbiopsy.jl | 31 -
 6 files changed, 2650 insertions(+), 1351 deletions(-)
 create mode 100644 examples/boston_housing.jl
 create mode 100644 examples/breast_cancer_biopsy.jl

diff --git a/examples/boston_housing.jl b/examples/boston_housing.jl
new file mode 100644
index 000000000..e7ea79123
--- /dev/null
+++ b/examples/boston_housing.jl
@@ -0,0 +1,941 @@
+ENV["linearreg_filename"] = string(tempname(), "_linearreg.jld2")
+ENV["randomforestreg_filename"] = string(tempname(), "_randomforestreg.jld2")
+ENV["knetmlpreg_filename"] = string(tempname(), "_knetmlpreg.jld2")
+
+Base.Test.@test(!isfile(ENV["linearreg_filename"]))
+Base.Test.@test(!isfile(ENV["randomforestreg_filename"]))
+Base.Test.@test(!isfile(ENV["knetmlpreg_filename"]))
+
+ENV["LOADTRAINEDMODELSFROMFILE"] = "false"
+ENV["SAVETRAINEDMODELSTOFILE"] = "true"
+
+linearreg_filename = ENV["linearreg_filename"]
+randomforestreg_filename = ENV["randomforestreg_filename"]
+knetmlpreg_filename = ENV["knetmlpreg_filename"]
+
+##############################################################################
+##############################################################################
+### Section 1: Setup #########################################################
+##############################################################################
+##############################################################################
+
+# import required packages
+import PredictMD
+import CSV
+import DataFrames
+import GZip
+import Knet
+import LIBSVM
+import StatsBase
+
+# set the seed of the global random number generator
+# this makes the results reproducible
+srand(999)
+
+##############################################################################
+##############################################################################
+### Section 2: Prepare data ##################################################
+##############################################################################
+##############################################################################
+
+# Import Boston housing data
+df = CSV.read(
+    GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")),
+    DataFrames.DataFrame,
+    )
+
+# Remove rows with missing data
+DataFrames.dropmissing!(df)
+
+# Shuffle rows
+PredictMD.shuffle_rows!(df)
+
+# Define features
+categoricalfeaturenames = Symbol[]
+continuousfeaturenames = Symbol[
+    :Crim,
+    :Zn,
+    :Indus,
+    :Chas,
+    :NOx,
+    :Rm,
+    :Age,
+    :Dis,
+    :Rad,
+    :Tax,
+    :PTRatio,
+    :Black,
+    :LStat,
+    ]
+featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames)
+end
+
+# Define labels
+labelname = :MedV
+
+# Put features and labels in separate dataframes
+features_df = df[featurenames]
+labels_df = df[[labelname]]
+
+# View summary statistics for label variable (mean, quartiles, etc.)
+DataFrames.describe(labels_df[labelname]) + +# Split the data into training (50%), validation (25%), and testing (25%) +trainingandvalidation_features_df, + trainingandvalidation_labels_df, + testing_features_df, + testing_labels_df = PredictMD.split_data( + features_df, + labels_df, + 0.75, # 75% training+validation, 25% testing + ) +training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df = PredictMD.split_data( + trainingandvalidation_features_df, + trainingandvalidation_labels_df, + 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation + ) + +############################################################################## +############################################################################## +### Section 3: Set up and train models ####################################### +############################################################################## +############################################################################## + +############################################################################## +## Linear regression ######################################################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + linearreg = PredictMD.load_model(linearreg_filename) +else + # Set up linear regression model + linearreg = PredictMD.singlelabeldataframelinearregression( + featurenames, + labelname; + package = :GLMjl, + intercept = true, # optional, defaults to true + interactions = 2, # optional, defaults to 1 + name = "Linear regression", # optional + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train linear regression model + PredictMD.fit!(linearreg,training_features_df,training_labels_df,) +end + +# View coefficients, p values, etc. 
for underlying linear regression +PredictMD.get_underlying(linearreg) + +# Plot true values versus predicted values for linear regression on training set +linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( + linearreg, + training_features_df, + training_labels_df, + labelname, + ) +PredictMD.open_plot(linearreg_plot_training) + +# Plot true values versus predicted values for linear regression on testing set +linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( + linearreg, + testing_features_df, + testing_labels_df, + labelname + ) +PredictMD.open_plot(linearreg_plot_testing) + +# Evaluate performance of linear regression on training set +PredictMD.singlelabelregressionmetrics( + linearreg, + training_features_df, + training_labels_df, + labelname, + ) + +# Evaluate performance of linear regression on testing set +PredictMD.singlelabelregressionmetrics( + linearreg, + testing_features_df, + testing_labels_df, + labelname, + ) + +############################################################################## +## Random forest regression ################################################## +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + randomforestreg = PredictMD.load_model(randomforestreg_filename) +else + # Set up random forest regression model + randomforestreg = PredictMD.singlelabeldataframerandomforestregression( + featurenames, + labelname; + nsubfeatures = 2, # number of subfeatures; defaults to 2 + ntrees = 20, # number of trees; defaults to 10 + package = :DecisionTreejl, + name = "Random forest", # optional + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train random forest model on training set + PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) +end + +# Plot true values versus predicted values for random forest on training set +randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( + randomforestreg, + training_features_df, + training_labels_df, + labelname, + ) +PredictMD.open_plot(randomforestreg_plot_training) + +# Plot true values versus predicted values for random forest on testing set +randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( + randomforestreg, + testing_features_df, + testing_labels_df, + labelname, + ) +PredictMD.open_plot(randomforestreg_plot_testing) + +# Evaluate performance of random forest on training set +PredictMD.singlelabelregressionmetrics( + randomforestreg, + training_features_df, + training_labels_df, + labelname, + ) + +# Evaluate performance of random forest on testing set +PredictMD.singlelabelregressionmetrics( + randomforestreg, + testing_features_df, + testing_labels_df, + labelname, + ) + +############################################################################## +## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### +############################################################################## + +# Define predict function +function knetmlp_predict( + w, # don't put a type annotation on this + x0::AbstractArray, + ) + # x0 = input layer + # x1 = hidden layer + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases + # x2 = output layer + x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + return x2 +end + +# Define loss function +function knetmlp_loss( + predict::Function, + modelweights, # don't put a type annotation on this + x::AbstractArray, + ytrue::AbstractArray; + L1::Real = Cfloat(0), + L2::Real = Cfloat(0), + ) + loss = mean( + abs2, + ytrue - predict( + modelweights, + x, + ), + ) + if L1 != 0 + loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) + end + if L2 != 0 + loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) + end + return loss +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + knetmlpreg = PredictMD.load_model(knetmlpreg_filename) +else + # Randomly initialize model weights + knetmlp_modelweights = Any[ + # input layer has dimension contrasts.num_array_columns + # + # hidden layer (10 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights + ), + Cfloat.( + zeros(Cfloat,10,1) # biases + ), + # + # output layer (regression nets have exactly 1 neuron in output layer): + Cfloat.( + 0.1f0*randn(Cfloat,1,10) # weights + ), + Cfloat.( + zeros(Cfloat,1,1) # biases + ), + ] + # Define loss hyperparameters + knetmlp_losshyperparameters = Dict() + knetmlp_losshyperparameters[:L1] = Cfloat(0.0) + knetmlp_losshyperparameters[:L2] = Cfloat(0.0) + # Select optimization algorithm + knetmlp_optimizationalgorithm = :Adam + # Set optimization hyperparameters + knetmlp_optimizerhyperparameters = Dict() + # Set the minibatch size + knetmlp_minibatchsize = 48 + # Set the max number of epochs. After training, look at the learning curve. If + # it looks like the model has not yet converged, raise maxepochs. If it looks + # like the loss has hit a plateau and you are worried about overfitting, lower + # maxepochs. + knetmlp_maxepochs = 1_000 + # Set up multilayer perceptron model + knetmlpreg = PredictMD.singlelabeldataframeknetregression( + featurenames, + labelname; + package = :Knetjl, + name = "Knet MLP", + predict = knetmlp_predict, + loss = knetmlp_loss, + losshyperparameters = knetmlp_losshyperparameters, + optimizationalgorithm = knetmlp_optimizationalgorithm, + optimizerhyperparameters = knetmlp_optimizerhyperparameters, + minibatchsize = knetmlp_minibatchsize, + modelweights = knetmlp_modelweights, + maxepochs = knetmlp_maxepochs, + printlosseverynepochs = 100, # if 0, will not print at all + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train multilayer perceptron model on training set + PredictMD.fit!( + knetmlpreg, + training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df, + ) +end + +# Plot learning curve: loss vs. epoch +knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_epoch; + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch) + +# Plot learning curve: loss vs. 
epoch, skip the first 10 epochs
+knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_epoch;
+    startat = 10,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs)
+
+# Plot learning curve: loss vs. iteration
+knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration)
+
+# Plot learning curve: loss vs. iteration, skip the first 100 iterations
+knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    startat = 100,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
+
+# Plot true values versus predicted values for multilayer perceptron on training set
+knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knetmlpreg,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_training)
+
+# Plot true values versus predicted values for multilayer perceptron on testing set
+knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_testing)
+
+# Evaluate performance of multilayer perceptron on training set
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+
+# Evaluate performance of multilayer perceptron on testing set
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+
+##############################################################################
+##############################################################################
+### Section 4: Compare performance of all models #############################
+##############################################################################
+##############################################################################
+
+all_models = PredictMD.Fittable[
+    linearreg,
+    randomforestreg,
+    knetmlpreg,
+    ]
+
+# Compare performance of all models on training set
+showall(PredictMD.singlelabelregressionmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    ))
+
+# Compare performance of all models on testing set
+showall(PredictMD.singlelabelregressionmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    ))
+
+##############################################################################
+##############################################################################
+### Section 5: Save trained models to file (if desired) #######################
+##############################################################################
+##############################################################################
+
+if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
+    PredictMD.save_model(linearreg_filename, linearreg)
+    PredictMD.save_model(randomforestreg_filename, randomforestreg)
+    PredictMD.save_model(knetmlpreg_filename, knetmlpreg)
+end
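+
+# As an aside: since the models were just saved above (when
+# SAVETRAINEDMODELSTOFILE is "true"), we can sketch the save/load round trip
+# by reloading one model from disk and generating predictions with it.
+# reloaded_linearreg is a hypothetical name used only for this illustration.
+if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
+    reloaded_linearreg = PredictMD.load_model(linearreg_filename)
+    PredictMD.predict(reloaded_linearreg, testing_features_df,)
+end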
+
+##############################################################################
+##############################################################################
+## Appendix A: Directly access the output of regression models ###############
+##############################################################################
+##############################################################################
+
+# We can use the PredictMD.predict() function to get the real-valued predictions
+# output by each of the regression models.
+
+# Get real-valued predictions from each model for training set
+PredictMD.predict(linearreg,training_features_df,)
+PredictMD.predict(randomforestreg,training_features_df,)
+PredictMD.predict(knetmlpreg,training_features_df,)
+
+# Get real-valued predictions from each model for testing set
+PredictMD.predict(linearreg,testing_features_df,)
+PredictMD.predict(randomforestreg,testing_features_df,)
+PredictMD.predict(knetmlpreg,testing_features_df,)
+
+
+Base.Test.@test(isfile(ENV["linearreg_filename"]))
+Base.Test.@test(isfile(ENV["randomforestreg_filename"]))
+Base.Test.@test(isfile(ENV["knetmlpreg_filename"]))
+
+ENV["LOADTRAINEDMODELSFROMFILE"] = "true"
+ENV["SAVETRAINEDMODELSTOFILE"] = "false"
+
+linearreg_filename = ENV["linearreg_filename"]
+randomforestreg_filename = ENV["randomforestreg_filename"]
+knetmlpreg_filename = ENV["knetmlpreg_filename"]
+
+##############################################################################
+##############################################################################
+### Section 1: Setup #########################################################
+##############################################################################
+##############################################################################
+
+# import required packages
+import PredictMD
+import CSV
+import DataFrames
+import GZip
+import Knet
+import LIBSVM
+import StatsBase
+
+# set the seed of the global random number generator
+# this makes the results reproducible
+srand(999)
+
+##############################################################################
+##############################################################################
+### Section 2: Prepare data ##################################################
+##############################################################################
+##############################################################################
+
+# Import Boston housing data
+df = CSV.read(
+    GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")),
+    DataFrames.DataFrame,
+    )
+
+# Remove rows with missing data
+DataFrames.dropmissing!(df)
+
+# Shuffle rows
+PredictMD.shuffle_rows!(df)
+
+# Define features
+categoricalfeaturenames = Symbol[]
+continuousfeaturenames = Symbol[
+    :Crim,
+    :Zn,
+    :Indus,
+    :Chas,
+    :NOx,
+    :Rm,
+    :Age,
+    :Dis,
+    :Rad,
+    :Tax,
+    :PTRatio,
+    :Black,
+    :LStat,
+    ]
+featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames)
+end
+
+# Define labels
+labelname = :MedV
+
+# Put features and labels in separate dataframes
+features_df = df[featurenames]
+labels_df = df[[labelname]]
+
+# View summary statistics for label variable (mean, quartiles, etc.)
+DataFrames.describe(labels_df[labelname]) + +# Split the data into training (50%), validation (25%), and testing (25%) +trainingandvalidation_features_df, + trainingandvalidation_labels_df, + testing_features_df, + testing_labels_df = PredictMD.split_data( + features_df, + labels_df, + 0.75, # 75% training+validation, 25% testing + ) +training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df = PredictMD.split_data( + trainingandvalidation_features_df, + trainingandvalidation_labels_df, + 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation + ) + +############################################################################## +############################################################################## +### Section 3: Set up and train models ####################################### +############################################################################## +############################################################################## + +############################################################################## +## Linear regression ######################################################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + linearreg = PredictMD.load_model(linearreg_filename) +else + # Set up linear regression model + linearreg = PredictMD.singlelabeldataframelinearregression( + featurenames, + labelname; + package = :GLMjl, + intercept = true, # optional, defaults to true + interactions = 2, # optional, defaults to 1 + name = "Linear regression", # optional + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train linear regression model + PredictMD.fit!(linearreg,training_features_df,training_labels_df,) +end + +# View coefficients, p values, etc. 
for underlying linear regression +PredictMD.get_underlying(linearreg) + +# Plot true values versus predicted values for linear regression on training set +linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( + linearreg, + training_features_df, + training_labels_df, + labelname, + ) +PredictMD.open_plot(linearreg_plot_training) + +# Plot true values versus predicted values for linear regression on testing set +linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( + linearreg, + testing_features_df, + testing_labels_df, + labelname + ) +PredictMD.open_plot(linearreg_plot_testing) + +# Evaluate performance of linear regression on training set +PredictMD.singlelabelregressionmetrics( + linearreg, + training_features_df, + training_labels_df, + labelname, + ) + +# Evaluate performance of linear regression on testing set +PredictMD.singlelabelregressionmetrics( + linearreg, + testing_features_df, + testing_labels_df, + labelname, + ) + +############################################################################## +## Random forest regression ################################################## +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + randomforestreg = PredictMD.load_model(randomforestreg_filename) +else + # Set up random forest regression model + randomforestreg = PredictMD.singlelabeldataframerandomforestregression( + featurenames, + labelname; + nsubfeatures = 2, # number of subfeatures; defaults to 2 + ntrees = 20, # number of trees; defaults to 10 + package = :DecisionTreejl, + name = "Random forest", # optional + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train random forest model on training set + PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) +end + +# Plot true values versus predicted values for random forest on training set +randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( + randomforestreg, + training_features_df, + training_labels_df, + labelname, + ) +PredictMD.open_plot(randomforestreg_plot_training) + +# Plot true values versus predicted values for random forest on testing set +randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( + randomforestreg, + testing_features_df, + testing_labels_df, + labelname, + ) +PredictMD.open_plot(randomforestreg_plot_testing) + +# Evaluate performance of random forest on training set +PredictMD.singlelabelregressionmetrics( + randomforestreg, + training_features_df, + training_labels_df, + labelname, + ) + +# Evaluate performance of random forest on testing set +PredictMD.singlelabelregressionmetrics( + randomforestreg, + testing_features_df, + testing_labels_df, + labelname, + ) + +############################################################################## +## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### +############################################################################## + +# Define predict function +function knetmlp_predict( + w, # don't put a type annotation on this + x0::AbstractArray, + ) + # x0 = input layer + # x1 = hidden layer + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases + # x2 = output layer + x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + return x2 +end + +# Define loss function +function knetmlp_loss( + predict::Function, + modelweights, # don't put a type annotation on this + x::AbstractArray, + ytrue::AbstractArray; + L1::Real = Cfloat(0), + L2::Real = Cfloat(0), + ) + loss = mean( + abs2, + ytrue - predict( + modelweights, + x, + ), + ) + if L1 != 0 + loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) + end + if L2 != 0 + loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) + end + return loss +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + knetmlpreg = PredictMD.load_model(knetmlpreg_filename) +else + # Randomly initialize model weights + knetmlp_modelweights = Any[ + # input layer has dimension contrasts.num_array_columns + # + # hidden layer (10 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights + ), + Cfloat.( + zeros(Cfloat,10,1) # biases + ), + # + # output layer (regression nets have exactly 1 neuron in output layer): + Cfloat.( + 0.1f0*randn(Cfloat,1,10) # weights + ), + Cfloat.( + zeros(Cfloat,1,1) # biases + ), + ] + # Define loss hyperparameters + knetmlp_losshyperparameters = Dict() + knetmlp_losshyperparameters[:L1] = Cfloat(0.0) + knetmlp_losshyperparameters[:L2] = Cfloat(0.0) + # Select optimization algorithm + knetmlp_optimizationalgorithm = :Adam + # Set optimization hyperparameters + knetmlp_optimizerhyperparameters = Dict() + # Set the minibatch size + knetmlp_minibatchsize = 48 + # Set the max number of epochs. After training, look at the learning curve. If + # it looks like the model has not yet converged, raise maxepochs. If it looks + # like the loss has hit a plateau and you are worried about overfitting, lower + # maxepochs. + knetmlp_maxepochs = 1_000 + # Set up multilayer perceptron model + knetmlpreg = PredictMD.singlelabeldataframeknetregression( + featurenames, + labelname; + package = :Knetjl, + name = "Knet MLP", + predict = knetmlp_predict, + loss = knetmlp_loss, + losshyperparameters = knetmlp_losshyperparameters, + optimizationalgorithm = knetmlp_optimizationalgorithm, + optimizerhyperparameters = knetmlp_optimizerhyperparameters, + minibatchsize = knetmlp_minibatchsize, + modelweights = knetmlp_modelweights, + maxepochs = knetmlp_maxepochs, + printlosseverynepochs = 100, # if 0, will not print at all + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train multilayer perceptron model on training set + PredictMD.fit!( + knetmlpreg, + training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df, + ) +end + +# Plot learning curve: loss vs. epoch +knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_epoch; + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch) + +# Plot learning curve: loss vs. 
epoch, skip the first 10 epochs
+knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_epoch;
+    startat = 10,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs)
+
+# Plot learning curve: loss vs. iteration
+knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration)
+
+# Plot learning curve: loss vs. iteration, skip the first 100 iterations
+knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
+    knetmlpreg,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    startat = 100,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
+
+# Plot true values versus predicted values for multilayer perceptron on training set
+knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knetmlpreg,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_training)
+
+# Plot true values versus predicted values for multilayer perceptron on testing set
+knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_testing)
+
+# Evaluate performance of multilayer perceptron on training set
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+
+# Evaluate performance of multilayer perceptron on testing set
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+
+##############################################################################
+##############################################################################
+### Section 4: Compare performance of all models #############################
+##############################################################################
+##############################################################################
+
+all_models = PredictMD.Fittable[
+    linearreg,
+    randomforestreg,
+    knetmlpreg,
+    ]
+
+# Compare performance of all models on training set
+showall(PredictMD.singlelabelregressionmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    ))
+
+# Compare performance of all models on testing set
+showall(PredictMD.singlelabelregressionmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    ))
+
+##############################################################################
+##############################################################################
+### Section 5: Save trained models to file (if desired) #######################
+##############################################################################
+##############################################################################
+
+if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
+    PredictMD.save_model(linearreg_filename, linearreg)
+    PredictMD.save_model(randomforestreg_filename, randomforestreg)
+    PredictMD.save_model(knetmlpreg_filename, knetmlpreg)
+end
+
+##############################################################################
+##############################################################################
+## Appendix A: Directly access the output of regression models ###############
+##############################################################################
+##############################################################################
+
+# We can use the PredictMD.predict() function to get the real-valued predictions
+# output by each of the regression models.
+
+# Get real-valued predictions from each model for training set
+PredictMD.predict(linearreg,training_features_df,)
+PredictMD.predict(randomforestreg,training_features_df,)
+PredictMD.predict(knetmlpreg,training_features_df,)
+
+# Get real-valued predictions from each model for testing set
+PredictMD.predict(linearreg,testing_features_df,)
+PredictMD.predict(randomforestreg,testing_features_df,)
+PredictMD.predict(knetmlpreg,testing_features_df,)
+
+
+Base.Test.@test(isfile(ENV["linearreg_filename"]))
+Base.Test.@test(isfile(ENV["randomforestreg_filename"]))
+Base.Test.@test(isfile(ENV["knetmlpreg_filename"]))
diff --git a/examples/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy.jl
new file mode 100644
index 000000000..e02da4cd7
--- /dev/null
+++ b/examples/breast_cancer_biopsy.jl
@@ -0,0 +1,1709 @@
+ENV["logisticclassifier_filename"] = string(tempname(), "_logisticclassifier.jld2")
+ENV["rfclassifier_filename"] = string(tempname(), "_rfclassifier.jld2")
+ENV["csvc_svmclassifier_filename"] = string(tempname(), "_csvc_svmclassifier.jld2")
+ENV["nusvc_svmclassifier_filename"] = string(tempname(), "_nusvc_svmclassifier.jld2")
+ENV["knetmlp_filename"] = string(tempname(), "_knetmlpclassifier.jld2")
+
+Base.Test.@test(!isfile(ENV["logisticclassifier_filename"]))
+Base.Test.@test(!isfile(ENV["rfclassifier_filename"]))
+Base.Test.@test(!isfile(ENV["csvc_svmclassifier_filename"]))
+Base.Test.@test(!isfile(ENV["nusvc_svmclassifier_filename"]))
+Base.Test.@test(!isfile(ENV["knetmlp_filename"]))
+
+ENV["LOADTRAINEDMODELSFROMFILE"] = "false"
+ENV["SAVETRAINEDMODELSTOFILE"] = "true"
+
+logisticclassifier_filename = ENV["logisticclassifier_filename"]
+rfclassifier_filename = ENV["rfclassifier_filename"]
+csvc_svmclassifier_filename = ENV["csvc_svmclassifier_filename"]
+nusvc_svmclassifier_filename = ENV["nusvc_svmclassifier_filename"]
+knetmlp_filename = ENV["knetmlp_filename"]
+
+##############################################################################
+##############################################################################
+### Section 1: Setup #########################################################
+##############################################################################
+##############################################################################
+
+# import required packages
+import PredictMD
+import DataFrames
+import Knet
+import LIBSVM
+import RDatasets
+import StatsBase
+
+# set the seed of the global random number generator
+# this makes the results reproducible
+srand(999)
+
+##############################################################################
+##############################################################################
+### Section 2: Prepare data ##################################################
+##############################################################################
+##############################################################################
+
+# Import breast cancer biopsy data
+df = RDatasets.dataset("MASS", "biopsy")
+
+# Remove rows with missing data
+DataFrames.dropmissing!(df)
+
+# Shuffle rows
+PredictMD.shuffle_rows!(df)
+
+# Define features
+categoricalfeaturenames = Symbol[]
+continuousfeaturenames = Symbol[
+    :V1,
+    :V2,
+    :V3,
+    :V4,
+    :V5,
+    
:V6,
+    :V7,
+    :V8,
+    :V9,
+    ]
+featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames)
+end
+
+# Define labels
+labelname = :Class
+negativeclass = "benign"
+positiveclass = "malignant"
+labellevels = [negativeclass, positiveclass]
+
+# Put features and labels in separate dataframes
+features_df = df[featurenames]
+labels_df = df[[labelname]]
+
+# Split the data into training (50%), validation (25%), and testing (25%)
+trainingandvalidation_features_df,
+    trainingandvalidation_labels_df,
+    testing_features_df,
+    testing_labels_df = PredictMD.split_data(
+        features_df,
+        labels_df,
+        0.75, # 75% training+validation, 25% testing
+        )
+training_features_df,
+    training_labels_df,
+    validation_features_df,
+    validation_labels_df = PredictMD.split_data(
+        trainingandvalidation_features_df,
+        trainingandvalidation_labels_df,
+        2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation
+        )
+
+##############################################################################
+##############################################################################
+### Section 3: Apply the SMOTE algorithm to the training set #################
+##############################################################################
+##############################################################################
+
+# Examine prevalence of each class in training set
+# DataFrames.describe(training_labels_df[labelname])
+StatsBase.countmap(training_labels_df[labelname])
+
+# We see that malignant is the minority class and benign is the majority class.
+# The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending
+# on random seed). We would like that ratio to be 1:1. We will use SMOTE
+# to generate synthetic minority class samples. We will also undersample the
+# majority class. The result will be a balanced training set.
+majorityclass = "benign"
+minorityclass = "malignant"
+
+smoted_training_features_df, smoted_training_labels_df = PredictMD.smote(
+    training_features_df,
+    training_labels_df,
+    featurenames,
+    labelname;
+    majorityclass = majorityclass,
+    minorityclass = minorityclass,
+    pct_over = 100, # how much to oversample the minority class
+    minority_to_majority_ratio = 1.0, # desired minority:majority ratio
+    k = 5,
+    )
+
+# Examine prevalence of each class in smoted training set
+# DataFrames.describe(smoted_training_labels_df[labelname])
+StatsBase.countmap(smoted_training_labels_df[labelname])
+
+# Now we have a ratio of malignant:benign that is 1:1.
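+
+# As an aside, the core of SMOTE's oversampling step is linear interpolation:
+# each synthetic sample lies on the segment between a minority-class sample
+# and one of its k nearest minority-class neighbors. The helper below is only
+# a hypothetical sketch of that idea (PredictMD.smote above does the real
+# work, including the neighbor search); it assumes x and neighbor are numeric
+# feature vectors.
+hypothetical_smote_interpolate(x::AbstractVector, neighbor::AbstractVector) =
+    x .+ rand() .* (neighbor .- x)
+
+# For example, hypothetical_smote_interpolate([2.0, 4.0], [4.0, 8.0]) returns
+# a point on the segment between the two vectors.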
+
+##############################################################################
+##############################################################################
+### Section 4: Set up and train models #######################################
+##############################################################################
+##############################################################################
+
+##############################################################################
+## Logistic "regression" classifier ##########################################
+##############################################################################
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+    logisticclassifier = PredictMD.load_model(logisticclassifier_filename)
+else
+    # Set up logistic classifier model
+    logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier(
+        featurenames,
+        labelname,
+        labellevels;
+        package = :GLMjl,
+        intercept = true, # optional, defaults to true
+        interactions = 1, # optional, defaults to 1
+        name = "Logistic regression", # optional
+        )
+end
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    # Train logistic classifier model on smoted training set
+    PredictMD.fit!(
+        logisticclassifier,
+        smoted_training_features_df,
+        smoted_training_labels_df,
+        )
+end
+
+# View coefficients, p values, etc. for underlying logistic regression
+PredictMD.get_underlying(logisticclassifier)
+
+# Plot classifier histogram for logistic classifier on smoted training set
+logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_training)
+
+# Plot classifier histogram for logistic classifier on testing set
+logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_testing)
+
+# Evaluate performance of logistic classifier on smoted training set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Evaluate performance of logistic classifier on testing set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Plot probability calibration curve for logistic classifier on smoted training set
+logistic_calibration_curve = PredictMD.plot_probability_calibration_curve(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.2,
+    )
+PredictMD.open_plot(logistic_calibration_curve)
+
+# Compute probability calibration metrics for logistic classifier on testing set
+PredictMD.probability_calibration_metrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.1,
+    )
+
+# Compute risk score cutoff values on testing set (using the mean)
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    average_function = mean,
+    )
+println(
+    string(
+        "Low risk: 0 to $(logistic_cutoffs[1]).",
+        " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).",
+        " High risk: $(logistic_cutoffs[2]) to 1.",
+        )
+    )
+showall(logistic_risk_group_prevalences)
+
+# Compute risk score cutoff values on testing set (using the median)
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    
testing_labels_df, + labelname, + positiveclass; + average_function = median, + ) +println( + string( + "Low risk: 0 to $(logistic_cutoffs[1]).", + " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", + " High risk: $(logistic_cutoffs[2]) to 1.", + ) + ) +showall(logistic_risk_group_prevalences) + +############################################################################## +## Random forest classifier ################################################## +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + rfclassifier = PredictMD.load_model(rfclassifier_filename) +else + # Set up random forest classifier model + rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( + featurenames, + labelname, + labellevels; + nsubfeatures = 4, # number of subfeatures; defaults to 2 + ntrees = 200, # number of trees; defaults to 10 + package = :DecisionTreejl, + name = "Random forest", # optional + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train random forest classifier model on smoted training set + PredictMD.fit!( + rfclassifier, + smoted_training_features_df, + smoted_training_labels_df, + ) +end + +# Plot classifier histogram for random forest classifier on smoted training set +rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + rfclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(rfclassifier_hist_training) + +# Plot classifier histogram for random forest classifier on testing set +rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(rfclassifier_hist_testing) + +# Evaluate performance of random forest classifier on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + rfclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of random forest on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +rf_calibration_curve = PredictMD.plot_probability_calibration_curve( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + window = 0.1, + ) +PredictMD.open_plot(rf_calibration_curve) + +############################################################################## +## Support vector machine (C support vector classifier) ###################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) +else + # Set up C-SVC model + csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( + featurenames, + labelname, + labellevels; + package = :LIBSVMjl, + svmtype = LIBSVM.SVC, + name = "SVM (C-SVC)", + verbose = false, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train C-SVC model on smoted training set + PredictMD.fit!( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + ) +end + +# Plot classifier histogram for C-SVC on 
smoted training set +csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(csvc_svmclassifier_hist_training) + +# Plot classifier histogram for C-SVC on testing set +csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + csvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(csvc_svmclassifier_hist_testing) + +# Evaluate performance of C-SVC on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of C-SVC on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + csvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +############################################################################## +## Support vector machine (nu support vector classifier) ##################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename) +else + # Set up nu-SVC model + nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( + featurenames, + labelname, + labellevels; + package = :LIBSVMjl, + svmtype = LIBSVM.NuSVC, + name = "SVM (nu-SVC)", + verbose = false, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train nu-SVC model on smoted training set + PredictMD.fit!( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + ) +end + +# Plot classifier histogram for nu-SVC on smoted training set +nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(nusvc_svmclassifier_hist_training) + +# Plot classifier histogram for nu-SVC on testing set +nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nusvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(nusvc_svmclassifier_hist_testing) + +# Evaluate performance of nu-SVC on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of SVM on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + nusvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +############################################################################## +## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### +############################################################################## + +# Define predict function +function knetmlp_predict( + w, # don't put a type annotation on this + x0::AbstractArray; + probabilities::Bool = true, + ) + # x0 = input layer + # x1 = first hidden layer + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases + # x2 = second hidden layer + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases + # x3 = output layer + x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases + unnormalizedlogprobs = x3 + if probabilities + normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) + normalizedprobs = exp.(normalizedlogprobs) + return normalizedprobs + else + return unnormalizedlogprobs + end +end + +# Define loss function +function knetmlp_loss( + predict::Function, + modelweights, # don't put a type annotation on this + x::AbstractArray, + ytrue::AbstractArray; + L1::Real = Cfloat(0), + L2::Real = Cfloat(0), + ) + loss = Knet.nll( + predict( + modelweights, + x; + probabilities = false, + ), + ytrue, + 1, # d = 1 means that instances are in columns + ) + if L1 != 0 + loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) + end + if L2 != 0 + loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) + end + return loss +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + knetmlpclassifier = PredictMD.load_model(knetmlp_filename) +else + # Randomly initialize model weights + knetmlp_modelweights = Any[ + # input layer has dimension contrasts.num_array_columns + # + # first hidden layer (64 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights + ), + Cfloat.( + zeros(Cfloat,64,1) # biases + ), + # + # second hidden layer (32 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,32,64) # weights + ), + Cfloat.( + zeros(Cfloat,32,1) # biases + ), + # + # output layer (number of neurons == number of classes): + Cfloat.( + 0.1f0*randn(Cfloat,2,32) # weights + ), + Cfloat.( + zeros(Cfloat,2,1) # biases + ), + ] + # Define loss hyperparameters + knetmlp_losshyperparameters = Dict() + knetmlp_losshyperparameters[:L1] = Cfloat(0.0) + knetmlp_losshyperparameters[:L2] = Cfloat(0.0) + # Select optimization algorithm + knetmlp_optimizationalgorithm = :Momentum + # Set optimization hyperparameters + knetmlp_optimizerhyperparameters = Dict() + # Set the minibatch size + knetmlp_minibatchsize = 48 + # Set the max number of epochs. After training, look at the learning curve. If + # it looks like the model has not yet converged, raise maxepochs. If it looks + # like the loss has hit a plateau and you are worried about overfitting, lower + # maxepochs. 
+ knetmlp_maxepochs = 1_000 + # Set up multilayer perceptron model + knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( + featurenames, + labelname, + labellevels; + package = :Knetjl, + name = "Knet MLP", + predict = knetmlp_predict, + loss = knetmlp_loss, + losshyperparameters = knetmlp_losshyperparameters, + optimizationalgorithm = knetmlp_optimizationalgorithm, + optimizerhyperparameters = knetmlp_optimizerhyperparameters, + minibatchsize = knetmlp_minibatchsize, + modelweights = knetmlp_modelweights, + printlosseverynepochs = 100, # if 0, will not print at all + maxepochs = knetmlp_maxepochs, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train multilayer perceptron model on training set + PredictMD.fit!( + knetmlpclassifier, + smoted_training_features_df, + smoted_training_labels_df, + validation_features_df, + validation_labels_df, + ) +end + +# Plot learning curve: loss vs. epoch +knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_epoch; + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch) + +# Plot learning curve: loss vs. epoch, skip the first 10 epochs +knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_epoch; + startat = 10, + endat = :end, + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) + +# Plot learning curve: loss vs. iteration +knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_iteration; + window = 50, + sampleevery = 10, + ) +PredictMD.open_plot(knet_learningcurve_lossvsiteration) + +# Plot learning curve: loss vs. iteration, skip the first 100 iterations +knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_iteration; + window = 50, + sampleevery = 10, + startat = 100, + endat = :end, + ) +PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) + +# Plot classifier histogram for multilayer perceptron on smoted training set +knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + knetmlpclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(knetmlpclassifier_hist_training) + +# Plot classifier histogram for multilayer perceptron on testing set +knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + knetmlpclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(knetmlpclassifier_hist_testing) + +# Evaluate performance of multilayer perceptron on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + knetmlpclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of multilayer perceptron on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + knetmlpclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +############################################################################## +############################################################################## +## Section 5: Compare performance of all models ############################## +############################################################################## 
+##############################################################################
+
+all_models = PredictMD.Fittable[
+    logisticclassifier,
+    rfclassifier,
+    csvc_svmclassifier,
+    nusvc_svmclassifier,
+    knetmlpclassifier,
+    ]
+
+# Compare performance of all models on training set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
+
+# Compare performance of all models on testing set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
+
+# Plot receiver operating characteristic curves for all models on testing set.
+rocplottesting = PredictMD.plotroccurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(rocplottesting)
+
+# Plot precision-recall curves for all models on testing set.
+
+# Plot precision-recall curves for all models on testing set.
+prplottesting = PredictMD.plotprcurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(prplottesting)
+
+##############################################################################
+##############################################################################
+### Section 6: Save trained models to file (if desired) ######################
+##############################################################################
+##############################################################################
+
+if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
+    PredictMD.save_model(logisticclassifier_filename, logisticclassifier)
+    PredictMD.save_model(rfclassifier_filename, rfclassifier)
+    PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier)
+    PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier)
+    PredictMD.save_model(knetmlp_filename, knetmlpclassifier)
+end
+
+##############################################################################
+##############################################################################
+## Appendix A: Directly access the output of classification models ###########
+##############################################################################
+##############################################################################
+
+# We can use the PredictMD.predict_proba() function to get the probabilities
+# output by each of the classification models.
+
+# Get probabilities from each model for smoted training set
+PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(rfclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,)
+
+# Get probabilities from each model for testing set
+PredictMD.predict_proba(logisticclassifier,testing_features_df,)
+PredictMD.predict_proba(rfclassifier,testing_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,testing_features_df,)
+
+# If we want to get predicted classes instead of probabilities, we can use
+# the PredictMD.predict() function to get the class predictions output by
+# each of the classification models. For each sample, PredictMD.predict()
+# will select the class with the highest probability. In the case of binary
+# classification, this is equivalent to using a threshold of 0.5.
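+
+# To see that equivalence concretely, here is a toy base-Julia sketch (the
+# probability vector is made up for illustration and does not come from
+# PredictMD):
+toy_probs_positive = [0.91, 0.07, 0.55, 0.49] # toy P(malignant) per sample
+# Picking the class with the higher of the two probabilities is the same as
+# comparing P(malignant) to 0.5:
+toy_predicted_classes = [p > 0.5 ? "malignant" : "benign"
+    for p in toy_probs_positive]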
+ +# Get class predictions from each model for smoted training set +PredictMD.predict(logisticclassifier,smoted_training_features_df,) +PredictMD.predict(rfclassifier,smoted_training_features_df,) +PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) + +# Get class predictions from each model for testing set +PredictMD.predict(logisticclassifier,testing_features_df,) +PredictMD.predict(rfclassifier,testing_features_df,) +PredictMD.predict(csvc_svmclassifier,testing_features_df,) +PredictMD.predict(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict(knetmlpclassifier,testing_features_df,) + +Base.Test.@test(isfile(ENV["logisticclassifier_filename"])) +Base.Test.@test(isfile(ENV["rfclassifier_filename"])) +Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"])) +Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"])) +Base.Test.@test(isfile(ENV["knetmlp_filename"])) + +ENV["LOADTRAINEDMODELSFROMFILE"] = "true" +ENV["SAVETRAINEDMODELSTOFILE"] = "false" + +logisticclassifier_filename = ENV["logisticclassifier_filename"] +rfclassifier_filename = ENV["rfclassifier_filename"] +csvc_svmclassifier_filename = ENV["csvc_svmclassifier_filename"] +nusvc_svmclassifier_filename = ENV["nusvc_svmclassifier_filename"] +knetmlp_filename = ENV["knetmlp_filename"] + +############################################################################## +############################################################################## +### Section 1: Setup ######################################################### +############################################################################## +############################################################################## + +# import required packages +import PredictMD +import DataFrames +import Knet +import LIBSVM +import RDatasets +import StatsBase + +# set the seed of the global random number generator +# this makes the results reproducible +srand(999) + +############################################################################## +############################################################################## +### Section 2: Prepare data ################################################## +############################################################################## +############################################################################## + +# Import breast cancer biopsy data +df = RDatasets.dataset("MASS", "biopsy") + +# Remove rows with missing data +DataFrames.dropmissing!(df) + +# Shuffle rows +PredictMD.shuffle_rows!(df) + +# Define features +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) +end + +# Define labels +labelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +labellevels = [negativeclass, positiveclass] + +# Put features and labels in separate dataframes +features_df = df[featurenames] +labels_df = df[[labelname]] + +# Split the data into training (50%), validation (25%), and testing (25%) +trainingandvalidation_features_df, + trainingandvalidation_labels_df, + testing_features_df, + testing_labels_df = PredictMD.split_data( + features_df, + labels_df, + 0.75, # 75% 
training+validation, 25% testing
+    )
+training_features_df,
+    training_labels_df,
+    validation_features_df,
+    validation_labels_df = PredictMD.split_data(
+    trainingandvalidation_features_df,
+    trainingandvalidation_labels_df,
+    2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation
+    )
+
+##############################################################################
+##############################################################################
+### Section 3: Apply the SMOTE algorithm to the training set #################
+##############################################################################
+##############################################################################
+
+# Examine prevalence of each class in training set
+# DataFrames.describe(training_labels_df[labelname])
+StatsBase.countmap(training_labels_df[labelname])
+
+# We see that malignant is the minority class and benign is the majority
+# class. The ratio of malignant:benign is somewhere between 1:2.5 and 1:3
+# (depending on random seed). We would like that ratio to be 1:1. We will use
+# SMOTE to generate synthetic minority class samples, and we will also
+# undersample the majority class. The result will be a balanced training set.
+# (A toy sketch of the SMOTE interpolation step appears just after the
+# logistic regression coefficients below.)
+majorityclass = "benign"
+minorityclass = "malignant"
+
+smoted_training_features_df, smoted_training_labels_df = PredictMD.smote(
+    training_features_df,
+    training_labels_df,
+    featurenames,
+    labelname;
+    majorityclass = majorityclass,
+    minorityclass = minorityclass,
+    pct_over = 100, # how much to oversample the minority class
+    minority_to_majority_ratio = 1.0, # desired minority:majority ratio
+    k = 5,
+    )
+
+# Examine prevalence of each class in smoted training set
+# DataFrames.describe(smoted_training_labels_df[labelname])
+StatsBase.countmap(smoted_training_labels_df[labelname])
+
+# Now we have a ratio of malignant:benign that is 1:1.
+
+##############################################################################
+##############################################################################
+### Section 4: Set up and train models #######################################
+##############################################################################
+##############################################################################
+
+##############################################################################
+## Logistic "regression" classifier ##########################################
+##############################################################################
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+    logisticclassifier = PredictMD.load_model(logisticclassifier_filename)
+else
+    # Set up logistic classifier model
+    logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier(
+        featurenames,
+        labelname,
+        labellevels;
+        package = :GLMjl,
+        intercept = true, # optional, defaults to true
+        interactions = 1, # optional, defaults to 1
+        name = "Logistic regression", # optional
+        )
+end
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    # Train logistic classifier model on smoted training set
+    PredictMD.fit!(
+        logisticclassifier,
+        smoted_training_features_df,
+        smoted_training_labels_df,
+        )
+end
+
+# View coefficients, p values, etc. for the underlying logistic regression
+PredictMD.get_underlying(logisticclassifier)
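+
+# As promised above, a toy base-Julia sketch of the interpolation step at the
+# heart of SMOTE (illustrative only -- PredictMD.smote() above is the real
+# implementation, and the two rows below are made-up minority-class samples):
+toy_sample   = [1.0, 4.0, 2.0] # one minority-class observation
+toy_neighbor = [3.0, 2.0, 2.0] # one of its k nearest minority neighbors
+toy_gap = rand() # random interpolation weight in [0, 1)
+# The synthetic sample lies on the segment between the two real samples:
+toy_synthetic = toy_sample .+ toy_gap .* (toy_neighbor .- toy_sample)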
+
+# Plot classifier histogram for logistic classifier on smoted training set
+logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_training)
+
+# Plot classifier histogram for logistic classifier on testing set
+logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_testing)
+
+# Evaluate performance of logistic classifier on smoted training set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Evaluate performance of logistic classifier on testing set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Plot probability calibration curve for logistic classifier on smoted training set
+logistic_calibration_curve = PredictMD.plot_probability_calibration_curve(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.2,
+    )
+PredictMD.open_plot(logistic_calibration_curve)
+
+# Compute probability calibration metrics for logistic classifier on testing set
+PredictMD.probability_calibration_metrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.1,
+    )
+
+# Compute risk score cutoffs on testing set, averaging with the mean
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    average_function = mean,
+    )
+println(
+    string(
+        "Low risk: 0 to $(logistic_cutoffs[1]).",
+        " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).",
+        " High risk: $(logistic_cutoffs[2]) to 1.",
+        )
+    )
+showall(logistic_risk_group_prevalences)
+
+# Compute risk score cutoffs on testing set, averaging with the median
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    average_function = median,
+    )
+println(
+    string(
+        "Low risk: 0 to $(logistic_cutoffs[1]).",
+        " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).",
+        " High risk: $(logistic_cutoffs[2]) to 1.",
+        )
+    )
+showall(logistic_risk_group_prevalences)
+
+##############################################################################
+## Random forest classifier ##################################################
+##############################################################################
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+    rfclassifier = PredictMD.load_model(rfclassifier_filename)
+else
+    # Set up random forest classifier model
+    rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier(
+        featurenames,
+        labelname,
+        labellevels;
+        nsubfeatures = 4, # number of subfeatures; defaults to 2
+        ntrees = 200, # number of trees; defaults to 10
+        package = :DecisionTreejl,
+        name = "Random forest", # optional
+        feature_contrasts = feature_contrasts,
+        )
+end
+
+if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
+else
+    # Train random forest classifier model on smoted training set
+    PredictMD.fit!(
+        rfclassifier,
+        smoted_training_features_df,
+        smoted_training_labels_df,
+        )
+end
+
+# Plot classifier histogram for random forest classifier on smoted training set
+rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + rfclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(rfclassifier_hist_training) + +# Plot classifier histogram for random forest classifier on testing set +rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(rfclassifier_hist_testing) + +# Evaluate performance of random forest classifier on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + rfclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of random forest on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +rf_calibration_curve = PredictMD.plot_probability_calibration_curve( + rfclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + window = 0.1, + ) +PredictMD.open_plot(rf_calibration_curve) + +############################################################################## +## Support vector machine (C support vector classifier) ###################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) +else + # Set up C-SVC model + csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( + featurenames, + labelname, + labellevels; + package = :LIBSVMjl, + svmtype = LIBSVM.SVC, + name = "SVM (C-SVC)", + verbose = false, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train C-SVC model on smoted training set + PredictMD.fit!( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + ) +end + +# Plot classifier histogram for C-SVC on smoted training set +csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(csvc_svmclassifier_hist_training) + +# Plot classifier histogram for C-SVC on testing set +csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + csvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(csvc_svmclassifier_hist_testing) + +# Evaluate performance of C-SVC on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + csvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of C-SVC on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + csvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +############################################################################## +## Support vector machine (nu support vector classifier) ##################### +############################################################################## + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + nusvc_svmclassifier = 
PredictMD.load_model(nusvc_svmclassifier_filename) +else + # Set up nu-SVC model + nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( + featurenames, + labelname, + labellevels; + package = :LIBSVMjl, + svmtype = LIBSVM.NuSVC, + name = "SVM (nu-SVC)", + verbose = false, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train nu-SVC model on smoted training set + PredictMD.fit!( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + ) +end + +# Plot classifier histogram for nu-SVC on smoted training set +nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(nusvc_svmclassifier_hist_training) + +# Plot classifier histogram for nu-SVC on testing set +nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nusvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + labellevels, + ) +PredictMD.open_plot(nusvc_svmclassifier_hist_testing) + +# Evaluate performance of nu-SVC on smoted training set +PredictMD.singlelabelbinaryclassificationmetrics( + nusvc_svmclassifier, + smoted_training_features_df, + smoted_training_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +# Evaluate performance of SVM on testing set +PredictMD.singlelabelbinaryclassificationmetrics( + nusvc_svmclassifier, + testing_features_df, + testing_labels_df, + labelname, + positiveclass; + sensitivity = 0.95, + ) + +############################################################################## +## Multilayer perceptron (i.e. fully connected feedforward neural network) ### +############################################################################## + +# Define predict function +function knetmlp_predict( + w, # don't put a type annotation on this + x0::AbstractArray; + probabilities::Bool = true, + ) + # x0 = input layer + # x1 = first hidden layer + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases + # x2 = second hidden layer + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases + # x3 = output layer + x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases + unnormalizedlogprobs = x3 + if probabilities + normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) + normalizedprobs = exp.(normalizedlogprobs) + return normalizedprobs + else + return unnormalizedlogprobs + end +end + +# Define loss function +function knetmlp_loss( + predict::Function, + modelweights, # don't put a type annotation on this + x::AbstractArray, + ytrue::AbstractArray; + L1::Real = Cfloat(0), + L2::Real = Cfloat(0), + ) + loss = Knet.nll( + predict( + modelweights, + x; + probabilities = false, + ), + ytrue, + 1, # d = 1 means that instances are in columns + ) + if L1 != 0 + loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) + end + if L2 != 0 + loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) + end + return loss +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" + knetmlpclassifier = PredictMD.load_model(knetmlp_filename) +else + # Randomly initialize model weights + knetmlp_modelweights = Any[ + # input layer has dimension contrasts.num_array_columns + # + # first hidden layer (64 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights + ), + Cfloat.( + zeros(Cfloat,64,1) # biases + ), 
+ # + # second hidden layer (32 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,32,64) # weights + ), + Cfloat.( + zeros(Cfloat,32,1) # biases + ), + # + # output layer (number of neurons == number of classes): + Cfloat.( + 0.1f0*randn(Cfloat,2,32) # weights + ), + Cfloat.( + zeros(Cfloat,2,1) # biases + ), + ] + # Define loss hyperparameters + knetmlp_losshyperparameters = Dict() + knetmlp_losshyperparameters[:L1] = Cfloat(0.0) + knetmlp_losshyperparameters[:L2] = Cfloat(0.0) + # Select optimization algorithm + knetmlp_optimizationalgorithm = :Momentum + # Set optimization hyperparameters + knetmlp_optimizerhyperparameters = Dict() + # Set the minibatch size + knetmlp_minibatchsize = 48 + # Set the max number of epochs. After training, look at the learning curve. If + # it looks like the model has not yet converged, raise maxepochs. If it looks + # like the loss has hit a plateau and you are worried about overfitting, lower + # maxepochs. + knetmlp_maxepochs = 1_000 + # Set up multilayer perceptron model + knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( + featurenames, + labelname, + labellevels; + package = :Knetjl, + name = "Knet MLP", + predict = knetmlp_predict, + loss = knetmlp_loss, + losshyperparameters = knetmlp_losshyperparameters, + optimizationalgorithm = knetmlp_optimizationalgorithm, + optimizerhyperparameters = knetmlp_optimizerhyperparameters, + minibatchsize = knetmlp_minibatchsize, + modelweights = knetmlp_modelweights, + printlosseverynepochs = 100, # if 0, will not print at all + maxepochs = knetmlp_maxepochs, + feature_contrasts = feature_contrasts, + ) +end + +if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" +else + # Train multilayer perceptron model on training set + PredictMD.fit!( + knetmlpclassifier, + smoted_training_features_df, + smoted_training_labels_df, + validation_features_df, + validation_labels_df, + ) +end + +# Plot learning curve: loss vs. epoch +knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_epoch; + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch) + +# Plot learning curve: loss vs. epoch, skip the first 10 epochs +knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_epoch; + startat = 10, + endat = :end, + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) + +# Plot learning curve: loss vs. iteration +knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( + knetmlpclassifier, + :loss_vs_iteration; + window = 50, + sampleevery = 10, + ) +PredictMD.open_plot(knet_learningcurve_lossvsiteration) + +# Plot learning curve: loss vs. 
iteration, skip the first 100 iterations
+knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
+    knetmlpclassifier,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    startat = 100,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
+
+# Plot classifier histogram for multilayer perceptron on smoted training set
+knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knetmlpclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(knetmlpclassifier_hist_training)
+
+# Plot classifier histogram for multilayer perceptron on testing set
+knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knetmlpclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(knetmlpclassifier_hist_testing)
+
+# Evaluate performance of multilayer perceptron on smoted training set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    knetmlpclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Evaluate performance of multilayer perceptron on testing set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    knetmlpclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+##############################################################################
+##############################################################################
+## Section 5: Compare performance of all models ##############################
+##############################################################################
+##############################################################################
+
+all_models = PredictMD.Fittable[
+    logisticclassifier,
+    rfclassifier,
+    csvc_svmclassifier,
+    nusvc_svmclassifier,
+    knetmlpclassifier,
+    ]
+
+# Compare performance of all models on training set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
+
+# Compare performance of all models on testing set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
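+
+# The `maximize = :cohen_kappa` comparisons above pick, for each model, the
+# threshold that maximizes Cohen's kappa, kappa = (p_o - p_e) / (1 - p_e),
+# where p_o is the observed agreement between predictions and true labels and
+# p_e is the agreement expected by chance. A toy worked example in base Julia
+# (the 2x2 confusion counts below are made up):
+toy_tp, toy_fp, toy_fn, toy_tn = 40, 10, 5, 45
+toy_n = toy_tp + toy_fp + toy_fn + toy_tn
+toy_p_o = (toy_tp + toy_tn) / toy_n # observed agreement: 0.85
+toy_p_e = ((toy_tp + toy_fp) * (toy_tp + toy_fn) +
+    (toy_fn + toy_tn) * (toy_fp + toy_tn)) / toy_n^2 # chance agreement: 0.5
+toy_kappa = (toy_p_o - toy_p_e) / (1 - toy_p_e) # 0.7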
+
+# Plot receiver operating characteristic curves for all models on testing set.
+rocplottesting = PredictMD.plotroccurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(rocplottesting)
+
+# Plot precision-recall curves for all models on testing set.
+prplottesting = PredictMD.plotprcurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(prplottesting)
+
+##############################################################################
+##############################################################################
+### Section 6: Save trained models to file (if desired) ######################
+##############################################################################
+##############################################################################
+
+if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
+    PredictMD.save_model(logisticclassifier_filename, logisticclassifier)
+    PredictMD.save_model(rfclassifier_filename, rfclassifier)
+    PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier)
+    PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier)
+    PredictMD.save_model(knetmlp_filename, knetmlpclassifier)
+end
+
+##############################################################################
+##############################################################################
+## Appendix A: Directly access the output of classification models ###########
+##############################################################################
+##############################################################################
+
+# We can use the PredictMD.predict_proba() function to get the probabilities
+# output by each of the classification models.
+
+# Get probabilities from each model for smoted training set
+PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(rfclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,)
+
+# Get probabilities from each model for testing set
+PredictMD.predict_proba(logisticclassifier,testing_features_df,)
+PredictMD.predict_proba(rfclassifier,testing_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,testing_features_df,)
+
+# If we want to get predicted classes instead of probabilities, we can use
+# the PredictMD.predict() function to get the class predictions output by
+# each of the classification models. For each sample, PredictMD.predict()
+# will select the class with the highest probability. In the case of binary
+# classification, this is equivalent to using a threshold of 0.5.
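+
+# A toy base-Julia sketch of what such class predictions let you compute by
+# hand (the vectors below are made up; PredictMD's metrics functions compute
+# this and much more for you):
+toy_predicted = ["malignant", "benign", "malignant", "benign"]
+toy_actual    = ["malignant", "benign", "benign",    "benign"]
+toy_accuracy = mean(toy_predicted .== toy_actual) # fraction correct: 0.75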
+ +# Get class predictions from each model for smoted training set +PredictMD.predict(logisticclassifier,smoted_training_features_df,) +PredictMD.predict(rfclassifier,smoted_training_features_df,) +PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) +PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) + +# Get class predictions from each model for testing set +PredictMD.predict(logisticclassifier,testing_features_df,) +PredictMD.predict(rfclassifier,testing_features_df,) +PredictMD.predict(csvc_svmclassifier,testing_features_df,) +PredictMD.predict(nusvc_svmclassifier,testing_features_df,) +PredictMD.predict(knetmlpclassifier,testing_features_df,) + +Base.Test.@test(isfile(ENV["logisticclassifier_filename"])) +Base.Test.@test(isfile(ENV["rfclassifier_filename"])) +Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"])) +Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"])) +Base.Test.@test(isfile(ENV["knetmlp_filename"])) diff --git a/test/cpu/functional/bostonhousing/run_bostonhousing.jl b/test/cpu/functional/bostonhousing/run_bostonhousing.jl index ee4e9a92d..e69de29bb 100644 --- a/test/cpu/functional/bostonhousing/run_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/run_bostonhousing.jl @@ -1,458 +0,0 @@ -linearreg_filename = ENV["linearreg_filename"] -randomforestreg_filename = ENV["randomforestreg_filename"] -knetmlpreg_filename = ENV["knetmlpreg_filename"] - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import LIBSVM -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname]) - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Linear regression ######################################################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - linearreg = PredictMD.load_model(linearreg_filename) -else - # Set up linear regression model - linearreg = PredictMD.singlelabeldataframelinearregression( - featurenames, - labelname; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 2, # optional, defaults to 1 - name = "Linear regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train linear regression model - PredictMD.fit!(linearreg,training_features_df,training_labels_df,) -end - -# View coefficients, p values, etc. 
for underlying linear regression -PredictMD.get_underlying(linearreg) - -# Plot true values versus predicted values for linear regression on training set -linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(linearreg_plot_training) - -# Plot true values versus predicted values for linear regression on testing set -linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - testing_features_df, - testing_labels_df, - labelname - ) -PredictMD.open_plot(linearreg_plot_testing) - -# Evaluate performance of linear regression on training set -PredictMD.singlelabelregressionmetrics( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of linear regression on testing set -PredictMD.singlelabelregressionmetrics( - linearreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Random forest regression ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - randomforestreg = PredictMD.load_model(randomforestreg_filename) -else - # Set up random forest regression model - randomforestreg = PredictMD.singlelabeldataframerandomforestregression( - featurenames, - labelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest model on training set - PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) -end - -# Plot true values versus predicted values for random forest on training set -randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_training) - -# Plot true values versus predicted values for random forest on testing set -randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_testing) - -# Evaluate performance of random forest on training set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray, - ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict( - modelweights, - x, - ), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpreg = PredictMD.load_model(knetmlpreg_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,10,1) # biases - ), - # - # output layer (regression nets have exactly 1 neuron in output layer): - Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights - ), - Cfloat.( - zeros(Cfloat,1,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Adam - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. - knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpreg, - training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. 
epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot true values versus predicted values for multilayer perceptron on training set -knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(knetmlpreg_plot_training) - -# Plot true values versus predicted values for multilayer perceptron on testing set -knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(knetmlpreg_plot_testing) - -# Evaluate performance of multilayer perceptron on training set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -############################################################################## -### Section 4: Compare performance of all models ############################# -############################################################################## -############################################################################## - -all_models = PredictMD.Fittable[ - linearreg, - randomforestreg, - knetmlpreg, - ] - -# Compare performance of all five models on training set -showall(PredictMD.singlelabelregressionmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelregressionmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - )) - -############################################################################## -############################################################################## -### Section 5: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save_model(linearreg_filename, linearreg) - PredictMD.save_model(randomforestreg_filename, randomforestreg) - PredictMD.save_model(knetmlpreg_filename, knetmlpreg) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of regression models ############### 
-############################################################################## -############################################################################## - -# We can use the PredictMD.predict() function to get the real-valued predictions -# output by each of regression models. - -# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,training_features_df,) -PredictMD.predict(randomforestreg,training_features_df,) -PredictMD.predict(knetmlpreg,training_features_df,) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testing_features_df,) -PredictMD.predict(randomforestreg,testing_features_df,) -PredictMD.predict(knetmlpreg,testing_features_df,) diff --git a/test/cpu/functional/bostonhousing/setup_bostonhousing.jl b/test/cpu/functional/bostonhousing/setup_bostonhousing.jl index 2194d7243..e69de29bb 100644 --- a/test/cpu/functional/bostonhousing/setup_bostonhousing.jl +++ b/test/cpu/functional/bostonhousing/setup_bostonhousing.jl @@ -1,23 +0,0 @@ -ENV["linearreg_filename"] = string(tempname(), "_linearreg.jld2") -ENV["randomforestreg_filename"] = string(tempname(), "_randomforestreg.jld2") -ENV["knetmlpreg_filename"] = string(tempname(), "_knetmlpreg.jld2") - -Base.Test.@test(!isfile(ENV["linearreg_filename"])) -Base.Test.@test(!isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(!isfile(ENV["knetmlpreg_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "false" -ENV["SAVETRAINEDMODELSTOFILE"] = "true" -include("run_bostonhousing.jl") - -Base.Test.@test(isfile(ENV["linearreg_filename"])) -Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -ENV["SAVETRAINEDMODELSTOFILE"] = "false" -include("run_bostonhousing.jl") - -Base.Test.@test(isfile(ENV["linearreg_filename"])) -Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) diff --git a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl index 829537a75..e69de29bb 100644 --- a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl +++ b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl @@ -1,839 +0,0 @@ -logisticclassifier_filename = ENV["logisticclassifier_filename"] -rfclassifier_filename = ENV["rfclassifier_filename"] -csvc_svmclassifier_filename = ENV["csvc_svmclassifier_filename"] -nusvc_svmclassifier_filename = ENV["nusvc_svmclassifier_filename"] -knetmlp_filename = ENV["knetmlp_filename"] - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import DataFrames -import Knet -import LIBSVM -import RDatasets -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## 
-############################################################################## -############################################################################## - -# Import breast cancer biopsy data -df = RDatasets.dataset("MASS", "biopsy") - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define features -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :V1, - :V2, - :V3, - :V4, - :V5, - :V6, - :V7, - :V8, - :V9, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :Class -negativeclass = "benign" -positiveclass = "malignant" -labellevels = [negativeclass, positiveclass] - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Apply the SMOTE algorithm to the training set ################# -############################################################################## -############################################################################## - -# Examine prevalence of each class in training set -# DataFrames.describe(training_labels_df[labelname]) -StatsBase.countmap(training_labels_df[labelname]) - -# We see that malignant is minority class and benign is majority class. -# The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending -# on random seed). We would like that ratio to be 1:1. We will use SMOTE -# to generate synthetic minority class samples. We will also undersample the -# minority class. The result will be a balanced training set. -majorityclass = "benign" -minorityclass = "malignant" - -smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( - training_features_df, - training_labels_df, - featurenames, - labelname; - majorityclass = majorityclass, - minorityclass = minorityclass, - pct_over = 100, # how much to oversample the minority class - minority_to_majority_ratio = 1.0, # desired minority:majority ratio - k = 5, - ) - -# Examine prevalence of each class in smoted training set -# DataFrames.describe(smoted_training_labels_df[labelname]) -StatsBase.countmap(smoted_training_labels_df[labelname]) - -# Now we have a ratio of malignant:benign that is 1:1. 
- -############################################################################## -############################################################################## -### Section 4: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Logistic "regression" classifier ########################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - logisticclassifier = PredictMD.load_model(logisticclassifier_filename) -else - # Set up logistic classifier model - logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( - featurenames, - labelname, - labellevels; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 1, # optional, defaults to 1 - name = "Logistic regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train logistic classifier model on smoted training set - PredictMD.fit!( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# View coefficients, p values, etc. for underlying logistic regression -PredictMD.get_underlying(logisticclassifier) - -# Plot classifier histogram for logistic classifier on smoted training set -logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_training) - -# Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_testing) - -# Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - window = 0.2, - ) -PredictMD.open_plot(logistic_calibration_curve) - -PredictMD.probability_calibration_metrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) - -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - average_function = mean, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - 
testing_labels_df, - labelname, - positiveclass; - average_function = median, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) - -############################################################################## -## Random forest classifier ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - rfclassifier = PredictMD.load_model(rfclassifier_filename) -else - # Set up random forest classifier model - rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( - featurenames, - labelname, - labellevels; - nsubfeatures = 4, # number of subfeatures; defaults to 2 - ntrees = 200, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest classifier model on smoted training set - PredictMD.fit!( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for random forest classifier on smoted training set -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_training) - -# Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_testing) - -# Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -rf_calibration_curve = PredictMD.plot_probability_calibration_curve( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) -PredictMD.open_plot(rf_calibration_curve) - -############################################################################## -## Support vector machine (C support vector classifier) ###################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) -else - # Set up C-SVC model - csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.SVC, - name = "SVM (C-SVC)", - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train C-SVC model on smoted training set - PredictMD.fit!( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for C-SVC on 
smoted training set
-csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    csvc_svmclassifier,
-    smoted_training_features_df,
-    smoted_training_labels_df,
-    labelname,
-    labellevels,
-    )
-PredictMD.open_plot(csvc_svmclassifier_hist_training)
-
-# Plot classifier histogram for C-SVC on testing set
-csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    csvc_svmclassifier,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    labellevels,
-    )
-PredictMD.open_plot(csvc_svmclassifier_hist_testing)
-
-# Evaluate performance of C-SVC on smoted training set
-PredictMD.singlelabelbinaryclassificationmetrics(
-    csvc_svmclassifier,
-    smoted_training_features_df,
-    smoted_training_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    )
-
-# Evaluate performance of C-SVC on testing set
-PredictMD.singlelabelbinaryclassificationmetrics(
-    csvc_svmclassifier,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    )
-
-##############################################################################
-## Support vector machine (nu support vector classifier) #####################
-##############################################################################
-
-if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
-    nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename)
-else
-    # Set up nu-SVC model
-    nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier(
-        featurenames,
-        labelname,
-        labellevels;
-        package = :LIBSVMjl,
-        svmtype = LIBSVM.NuSVC,
-        name = "SVM (nu-SVC)",
-        verbose = false,
-        feature_contrasts = feature_contrasts,
-        )
-end
-
-if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
-else
-    # Train nu-SVC model on smoted training set
-    PredictMD.fit!(
-        nusvc_svmclassifier,
-        smoted_training_features_df,
-        smoted_training_labels_df,
-        )
-end
-
-# Plot classifier histogram for nu-SVC on smoted training set
-nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    nusvc_svmclassifier,
-    smoted_training_features_df,
-    smoted_training_labels_df,
-    labelname,
-    labellevels,
-    )
-PredictMD.open_plot(nusvc_svmclassifier_hist_training)
-
-# Plot classifier histogram for nu-SVC on testing set
-nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    nusvc_svmclassifier,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    labellevels,
-    )
-PredictMD.open_plot(nusvc_svmclassifier_hist_testing)
-
-# Evaluate performance of nu-SVC on smoted training set
-PredictMD.singlelabelbinaryclassificationmetrics(
-    nusvc_svmclassifier,
-    smoted_training_features_df,
-    smoted_training_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    )
-
-# Evaluate performance of nu-SVC on testing set
-PredictMD.singlelabelbinaryclassificationmetrics(
-    nusvc_svmclassifier,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    )
-
-##############################################################################
-## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - probabilities::Bool = true, - ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases - unnormalizedlogprobs = x3 - if probabilities - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - else - return unnormalizedlogprobs - end -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict( - modelweights, - x; - probabilities = false, - ), - ytrue, - 1, # d = 1 means that instances are in columns - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpclassifier = PredictMD.load_model(knetmlp_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # first hidden layer (64 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,64,1) # biases - ), - # - # second hidden layer (32 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,32,64) # weights - ), - Cfloat.( - zeros(Cfloat,32,1) # biases - ), - # - # output layer (number of neurons == number of classes): - Cfloat.( - 0.1f0*randn(Cfloat,2,32) # weights - ), - Cfloat.( - zeros(Cfloat,2,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Momentum - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. 
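# The tuning advice in the comment above can be made concrete. A minimal
# sketch of the plateau heuristic, assuming `epoch_losses` is a hypothetical
# vector of per-epoch validation losses that you have collected yourself
# (PredictMD is not assumed to provide it):
function loss_has_plateaued(
        epoch_losses::AbstractVector;
        window::Integer = 50,
        tol::Real = 1e-4,
        )
    # Need two full windows of history before comparing.
    length(epoch_losses) < 2 * window && return false
    recent  = mean(epoch_losses[(end - window + 1):end])
    earlier = mean(epoch_losses[(end - 2 * window + 1):(end - window)])
    # If the mean loss barely improved between the two windows, raising
    # maxepochs is unlikely to help; if it is still falling, raise maxepochs.
    return (earlier - recent) < tol
end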
- knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( - featurenames, - labelname, - labellevels; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - printlosseverynepochs = 100, # if 0, will not print at all - maxepochs = knetmlp_maxepochs, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_training) - -# Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_testing) - -# Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -############################################################################## -## Section 5: Compare performance of all models ############################## -############################################################################## 
-##############################################################################
-##############################################################################
-
-all_models = PredictMD.Fittable[
-    logisticclassifier,
-    rfclassifier,
-    csvc_svmclassifier,
-    nusvc_svmclassifier,
-    knetmlpclassifier,
-    ]
-
-# Compare performance of all models on training set
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    positiveclass;
-    specificity = 0.95,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    positiveclass;
-    maximize = :f1score,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    positiveclass;
-    maximize = :cohen_kappa,
-    ))
-
-# Compare performance of all models on testing set
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    sensitivity = 0.95,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    specificity = 0.95,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    maximize = :f1score,
-    ))
-showall(PredictMD.singlelabelbinaryclassificationmetrics(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass;
-    maximize = :cohen_kappa,
-    ))
-
-# Plot receiver operating characteristic curves for all models on testing set.
-rocplottesting = PredictMD.plotroccurves(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass,
-    )
-PredictMD.open_plot(rocplottesting)
-
-# Plot precision-recall curves for all models on testing set.
-prplottesting = PredictMD.plotprcurves(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    positiveclass,
-    )
-PredictMD.open_plot(prplottesting)
-
-##############################################################################
-##############################################################################
-## Section 6: Save trained models to file (if desired) #######################
-##############################################################################
-##############################################################################
-
-if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
-    PredictMD.save_model(logisticclassifier_filename, logisticclassifier)
-    PredictMD.save_model(rfclassifier_filename, rfclassifier)
-    PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier)
-    PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier)
-    PredictMD.save_model(knetmlp_filename, knetmlpclassifier)
-end
-
-##############################################################################
-##############################################################################
-## Appendix A: Directly access the output of classification models ###########
-##############################################################################
-##############################################################################
-
-# We can use the PredictMD.predict_proba() function to get the probabilities output
-# by each of the classification models.
-
-# Get probabilities from each model for smoted training set
-PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,)
-PredictMD.predict_proba(rfclassifier,smoted_training_features_df,)
-PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,)
-PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,)
-PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,)
-
-# Get probabilities from each model for testing set
-PredictMD.predict_proba(logisticclassifier,testing_features_df,)
-PredictMD.predict_proba(rfclassifier,testing_features_df,)
-PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,)
-PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,)
-PredictMD.predict_proba(knetmlpclassifier,testing_features_df,)
-
-# If we want to get predicted classes instead of probabilities, we can use the
-# PredictMD.predict() function to get the class predictions output by each of the
-# classification models. For each sample, PredictMD.predict() will select the class
-# with the highest probability. In the case of binary classification, this is
-# equivalent to using a threshold of 0.5.
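# A minimal sketch of the threshold idea described above, assuming
# `probs_positive` is a hypothetical vector holding each sample's predicted
# probability of `positiveclass` (extracted from the predict_proba output):
custom_threshold = 0.5
predicted_positive = probs_positive .>= custom_threshold
# With the default cutoff of 0.5 this selects the same class that
# PredictMD.predict() is described as choosing; lowering the cutoff trades
# specificity for sensitivity.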
- -# Get class predictions from each model for smoted training set -PredictMD.predict(logisticclassifier,smoted_training_features_df,) -PredictMD.predict(rfclassifier,smoted_training_features_df,) -PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) - -# Get class predictions from each model for testing set -PredictMD.predict(logisticclassifier,testing_features_df,) -PredictMD.predict(rfclassifier,testing_features_df,) -PredictMD.predict(csvc_svmclassifier,testing_features_df,) -PredictMD.predict(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict(knetmlpclassifier,testing_features_df,) diff --git a/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl index bb4f6ee1d..e69de29bb 100644 --- a/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl +++ b/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl @@ -1,31 +0,0 @@ -ENV["logisticclassifier_filename"] = string(tempname(), "_logisticclassifier.jld2") -ENV["rfclassifier_filename"] = string(tempname(), "_rfclassifier.jld2") -ENV["csvc_svmclassifier_filename"] = string(tempname(), "_csvc_svmclassifier.jld2") -ENV["nusvc_svmclassifier_filename"] = string(tempname(), "_nusvc_svmclassifier.jld2") -ENV["knetmlp_filename"] = string(tempname(), "_knetmlpclassifier.jld2") - -Base.Test.@test(!isfile(ENV["logisticclassifier_filename"])) -Base.Test.@test(!isfile(ENV["rfclassifier_filename"])) -Base.Test.@test(!isfile(ENV["csvc_svmclassifier_filename"])) -Base.Test.@test(!isfile(ENV["nusvc_svmclassifier_filename"])) -Base.Test.@test(!isfile(ENV["knetmlp_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "false" -ENV["SAVETRAINEDMODELSTOFILE"] = "true" -include("run_breastcancerbiopsy.jl") - -Base.Test.@test(isfile(ENV["logisticclassifier_filename"])) -Base.Test.@test(isfile(ENV["rfclassifier_filename"])) -Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["knetmlp_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -ENV["SAVETRAINEDMODELSTOFILE"] = "false" -include("run_breastcancerbiopsy.jl") - -Base.Test.@test(isfile(ENV["logisticclassifier_filename"])) -Base.Test.@test(isfile(ENV["rfclassifier_filename"])) -Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["knetmlp_filename"])) From 4f8492578e3df5c2abec223d9aab91498663e769 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 18:12:17 -0400 Subject: [PATCH 28/62] Progress commit --- test/cpu/functional/bostonhousing/run_bostonhousing.jl | 0 test/cpu/functional/bostonhousing/setup_bostonhousing.jl | 0 test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl | 0 .../cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl | 0 4 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/cpu/functional/bostonhousing/run_bostonhousing.jl delete mode 100644 test/cpu/functional/bostonhousing/setup_bostonhousing.jl delete mode 100644 test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl delete mode 100644 test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl diff --git a/test/cpu/functional/bostonhousing/run_bostonhousing.jl 
b/test/cpu/functional/bostonhousing/run_bostonhousing.jl deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/cpu/functional/bostonhousing/setup_bostonhousing.jl b/test/cpu/functional/bostonhousing/setup_bostonhousing.jl deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/run_breastcancerbiopsy.jl deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl b/test/cpu/functional/breastcancerbiopsy/setup_breastcancerbiopsy.jl deleted file mode 100644 index e69de29bb..000000000 From a0f8b3f3c23eb52c767a6ddb98ec2eeb865a23cb Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 20:40:16 -0400 Subject: [PATCH 29/62] Update .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a127ba002..57e474a1d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ data/ docs/build/ docs/generated/ docs/site/ +docs/src/examples/ input/ output/ @@ -21,4 +22,3 @@ deps/deps.jl deps/pdf2svg.svg deps/showed_warning scratch.jl - From 9dbcdcfd5d21c86a00468ff58ba2121938ac82af Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Mon, 21 May 2018 21:51:18 -0400 Subject: [PATCH 30/62] Progress commit --- docs/make_docs.jl | 50 ++++++++++--------- .../{ => boston_housing}/boston_housing.jl | 2 - .../breast_cancer_biopsy.jl | 0 3 files changed, 27 insertions(+), 25 deletions(-) rename examples/{ => boston_housing}/boston_housing.jl (99%) rename examples/{ => breast_cancer_biopsy}/breast_cancer_biopsy.jl (100%) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index f20aeeb73..020109ab2 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -1,47 +1,51 @@ import Documenter import Literate + +info("DEBUG: importing PredictMD") + import PredictMD -examples_input_directory = joinpath( +info("DEBUG: using Literate.jl to generate examples") + +examples_output_parent_directory = joinpath( @__DIR__, - "..", + "src", "examples", ) - -examples_output_directory = joinpath( +examples_input_parent_directory = joinpath( @__DIR__, - "", - "", - "", - "", + "..", + "examples", ) -Literate.markdown( - , - examples_output_directory, +boston_housing_output_directory = joinpath( + examples_output_parent_directory, + "boston_housing", ) -Literate.notebook( - , - examples_output_directory, +boston_housing_input_directory = joinpath( + examples_input_parent_directory, + "boston_housing", ) -Literate.script( - , - examples_output_directory, +boston_housing_input_file = joinpath( + boston_housing_input_directory, + "boston_housing.jl", ) Literate.markdown( - , - examples_output_directory, + boston_housing_input_file, + boston_housing_output_directory, ) Literate.notebook( - , - examples_output_directory, + boston_housing_input_file, + boston_housing_output_directory, ) Literate.script( - , - examples_output_directory, + boston_housing_input_file, + boston_housing_output_directory, ) +info("DEBUG: using Documenter.jl to generate Markdown docs") + Documenter.makedocs( modules = [PredictMD], sitename = "PredictMD.jl", diff --git a/examples/boston_housing.jl b/examples/boston_housing/boston_housing.jl similarity index 99% rename from examples/boston_housing.jl rename to examples/boston_housing/boston_housing.jl index e7ea79123..b10ae3baf 100644 --- a/examples/boston_housing.jl +++ b/examples/boston_housing/boston_housing.jl @@ -699,7 +699,6 @@ 
PredictMD.singlelabelregressionmetrics(
 ## Multilayer perceptron (i.e. fully connected feedforward neural network) ###
 ##############################################################################
 
-# Define predict function
 function knetmlp_predict(
     w, # don't put a type annotation on this
     x0::AbstractArray,
@@ -712,7 +711,6 @@ function knetmlp_predict(
     return x2
 end
 
-# Define loss function
 function knetmlp_loss(
     predict::Function,
     modelweights, # don't put a type annotation on this
diff --git a/examples/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy/breast_cancer_biopsy.jl
similarity index 100%
rename from examples/breast_cancer_biopsy.jl
rename to examples/breast_cancer_biopsy/breast_cancer_biopsy.jl

From 5fe3c6ffc99caf23c0de6a86feb47269b41beb05 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 22:16:49 -0400
Subject: [PATCH 31/62] Progress commit

---
 docs/make_docs.jl | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/docs/make_docs.jl b/docs/make_docs.jl
index 020109ab2..5f4ab948c 100644
--- a/docs/make_docs.jl
+++ b/docs/make_docs.jl
@@ -26,22 +26,23 @@ boston_housing_input_directory = joinpath(
     examples_input_parent_directory,
     "boston_housing",
     )
-boston_housing_input_file = joinpath(
-    boston_housing_input_directory,
-    "boston_housing.jl",
-    )
 
 Literate.markdown(
-    boston_housing_input_file,
-    boston_housing_output_directory,
+    joinpath(boston_housing_input_directory, "boston_housing.jl"),
+    boston_housing_output_directory;
+    documenter = true,
     )
 Literate.notebook(
-    boston_housing_input_file,
-    boston_housing_output_directory,
+    joinpath(boston_housing_input_directory, "boston_housing.jl"),
+    boston_housing_output_directory;
+    documenter = true,
+    execute = false,
     )
 Literate.script(
-    boston_housing_input_file,
-    boston_housing_output_directory,
+    joinpath(boston_housing_input_directory, "boston_housing.jl"),
+    boston_housing_output_directory;
+    documenter = true,
+    keep_comments = true,
     )

From c8c324e58c4d9f8597537efb7cb26bbaa9fa5caf Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 22:35:40 -0400
Subject: [PATCH 32/62] Progress commit

---
 docs/mkdocs.yml                                                 | 1 +
 .../boston_housing/{boston_housing.jl => OLD_boston_housing.jl} | 0
 .../{breast_cancer_biopsy.jl => OLD_breast_cancer_biopsy.jl}    | 0
 3 files changed, 1 insertion(+)
 rename examples/boston_housing/{boston_housing.jl => OLD_boston_housing.jl} (100%)
 rename examples/breast_cancer_biopsy/{breast_cancer_biopsy.jl => OLD_breast_cancer_biopsy.jl} (100%)

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 453cd9ee5..87e73e7b1 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -23,5 +23,6 @@ docs_dir: 'build'
 pages:
   - Home: index.md
   - Examples: examples.md
+  - Boston Housing: examples/boston_housing/boston_housing.md
   - Library:
     - 'Internals': 'library/internals.md'
diff --git a/examples/boston_housing/boston_housing.jl b/examples/boston_housing/OLD_boston_housing.jl
similarity index 100%
rename from examples/boston_housing/boston_housing.jl
rename to examples/boston_housing/OLD_boston_housing.jl
diff --git a/examples/breast_cancer_biopsy/breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl
similarity index 100%
rename from examples/breast_cancer_biopsy/breast_cancer_biopsy.jl
rename to examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl

From 5f6c956162a412b2e9b633d4158a3962b7b0ec83 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 23:26:53 
-0400 Subject: [PATCH 33/62] Progress commit --- examples/boston_housing/OLD_boston_housing.jl | 939 ------------------ examples/boston_housing/compare_models.jl | 83 ++ examples/boston_housing/get_model_output.jl | 71 ++ .../boston_housing/knet_mlp_regression.jl | 215 ++++ examples/boston_housing/linear_regression.jl | 106 ++ examples/boston_housing/preprocess_data.jl | 148 +++ .../random_forest_regression.jl | 105 ++ 7 files changed, 728 insertions(+), 939 deletions(-) create mode 100644 examples/boston_housing/compare_models.jl create mode 100644 examples/boston_housing/get_model_output.jl create mode 100644 examples/boston_housing/knet_mlp_regression.jl create mode 100644 examples/boston_housing/linear_regression.jl create mode 100644 examples/boston_housing/preprocess_data.jl create mode 100644 examples/boston_housing/random_forest_regression.jl diff --git a/examples/boston_housing/OLD_boston_housing.jl b/examples/boston_housing/OLD_boston_housing.jl index b10ae3baf..e69de29bb 100644 --- a/examples/boston_housing/OLD_boston_housing.jl +++ b/examples/boston_housing/OLD_boston_housing.jl @@ -1,939 +0,0 @@ -ENV["linearreg_filename"] = string(tempname(), "_linearreg.jld2") -ENV["randomforestreg_filename"] = string(tempname(), "_randomforestreg.jld2") -ENV["knetmlpreg_filename"] = string(tempname(), "_knetmlpreg.jld2") - -Base.Test.@test(!isfile(ENV["linearreg_filename"])) -Base.Test.@test(!isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(!isfile(ENV["knetmlpreg_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "false" -ENV["SAVETRAINEDMODELSTOFILE"] = "true" - -linearreg_filename = ENV["linearreg_filename"] -randomforestreg_filename = ENV["randomforestreg_filename"] -knetmlpreg_filename = ENV["knetmlpreg_filename"] - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import LIBSVM -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and 
labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# View summary statistics for label variable (mean, quartiles, etc.) -DataFrames.describe(labels_df[labelname]) - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Linear regression ######################################################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - linearreg = PredictMD.load_model(linearreg_filename) -else - # Set up linear regression model - linearreg = PredictMD.singlelabeldataframelinearregression( - featurenames, - labelname; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 2, # optional, defaults to 1 - name = "Linear regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train linear regression model - PredictMD.fit!(linearreg,training_features_df,training_labels_df,) -end - -# View coefficients, p values, etc. 
for underlying linear regression -PredictMD.get_underlying(linearreg) - -# Plot true values versus predicted values for linear regression on training set -linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(linearreg_plot_training) - -# Plot true values versus predicted values for linear regression on testing set -linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - testing_features_df, - testing_labels_df, - labelname - ) -PredictMD.open_plot(linearreg_plot_testing) - -# Evaluate performance of linear regression on training set -PredictMD.singlelabelregressionmetrics( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of linear regression on testing set -PredictMD.singlelabelregressionmetrics( - linearreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Random forest regression ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - randomforestreg = PredictMD.load_model(randomforestreg_filename) -else - # Set up random forest regression model - randomforestreg = PredictMD.singlelabeldataframerandomforestregression( - featurenames, - labelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest model on training set - PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) -end - -# Plot true values versus predicted values for random forest on training set -randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_training) - -# Plot true values versus predicted values for random forest on testing set -randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_testing) - -# Evaluate performance of random forest on training set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray, - ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict( - modelweights, - x, - ), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpreg = PredictMD.load_model(knetmlpreg_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,10,1) # biases - ), - # - # output layer (regression nets have exactly 1 neuron in output layer): - Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights - ), - Cfloat.( - zeros(Cfloat,1,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Adam - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. - knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpreg, - training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. 
epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot true values versus predicted values for multilayer perceptron on training set -knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(knetmlpreg_plot_training) - -# Plot true values versus predicted values for multilayer perceptron on testing set -knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(knetmlpreg_plot_testing) - -# Evaluate performance of multilayer perceptron on training set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelregressionmetrics( - knetmlpreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -############################################################################## -### Section 4: Compare performance of all models ############################# -############################################################################## -############################################################################## - -all_models = PredictMD.Fittable[ - linearreg, - randomforestreg, - knetmlpreg, - ] - -# Compare performance of all five models on training set -showall(PredictMD.singlelabelregressionmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelregressionmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - )) - -############################################################################## -############################################################################## -### Section 5: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save_model(linearreg_filename, linearreg) - PredictMD.save_model(randomforestreg_filename, randomforestreg) - PredictMD.save_model(knetmlpreg_filename, knetmlpreg) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of regression models ############### 
-############################################################################## -############################################################################## - -# We can use the PredictMD.predict() function to get the real-valued predictions -# output by each of regression models. - -# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,training_features_df,) -PredictMD.predict(randomforestreg,training_features_df,) -PredictMD.predict(knetmlpreg,training_features_df,) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testing_features_df,) -PredictMD.predict(randomforestreg,testing_features_df,) -PredictMD.predict(knetmlpreg,testing_features_df,) - - -Base.Test.@test(isfile(ENV["linearreg_filename"])) -Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -ENV["SAVETRAINEDMODELSTOFILE"] = "false" - -linearreg_filename = ENV["linearreg_filename"] -randomforestreg_filename = ENV["randomforestreg_filename"] -knetmlpreg_filename = ENV["knetmlpreg_filename"] - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import CSV -import DataFrames -import GZip -import Knet -import LIBSVM -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import Boston housing data -df = CSV.read( - GZip.gzopen(joinpath(Pkg.dir("RDatasets"),"data","MASS","Boston.csv.gz")), - DataFrames.DataFrame, - ) - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define labels -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :Crim, - :Zn, - :Indus, - :Chas, - :NOx, - :Rm, - :Age, - :Dis, - :Rad, - :Tax, - :PTRatio, - :Black, - :LStat, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :MedV - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# View summary statistics for label variable (mean, quartiles, etc.) 
-DataFrames.describe(labels_df[labelname]) - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Linear regression ######################################################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - linearreg = PredictMD.load_model(linearreg_filename) -else - # Set up linear regression model - linearreg = PredictMD.singlelabeldataframelinearregression( - featurenames, - labelname; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 2, # optional, defaults to 1 - name = "Linear regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train linear regression model - PredictMD.fit!(linearreg,training_features_df,training_labels_df,) -end - -# View coefficients, p values, etc. 
for underlying linear regression -PredictMD.get_underlying(linearreg) - -# Plot true values versus predicted values for linear regression on training set -linearreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(linearreg_plot_training) - -# Plot true values versus predicted values for linear regression on testing set -linearreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - linearreg, - testing_features_df, - testing_labels_df, - labelname - ) -PredictMD.open_plot(linearreg_plot_testing) - -# Evaluate performance of linear regression on training set -PredictMD.singlelabelregressionmetrics( - linearreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of linear regression on testing set -PredictMD.singlelabelregressionmetrics( - linearreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Random forest regression ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - randomforestreg = PredictMD.load_model(randomforestreg_filename) -else - # Set up random forest regression model - randomforestreg = PredictMD.singlelabeldataframerandomforestregression( - featurenames, - labelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest model on training set - PredictMD.fit!(randomforestreg,training_features_df,training_labels_df,) -end - -# Plot true values versus predicted values for random forest on training set -randomforestreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_training) - -# Plot true values versus predicted values for random forest on testing set -randomforestreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) -PredictMD.open_plot(randomforestreg_plot_testing) - -# Evaluate performance of random forest on training set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - training_features_df, - training_labels_df, - labelname, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelregressionmetrics( - randomforestreg, - testing_features_df, - testing_labels_df, - labelname, - ) - -############################################################################## -## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### -############################################################################## - -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray, - ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases - return x2 -end - -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict( - modelweights, - x, - ), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpreg = PredictMD.load_model(knetmlpreg_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,10,1) # biases - ), - # - # output layer (regression nets have exactly 1 neuron in output layer): - Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights - ), - Cfloat.( - zeros(Cfloat,1,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Adam - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. - knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpreg = PredictMD.singlelabeldataframeknetregression( - featurenames, - labelname; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpreg, - training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpreg, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. 
iteration
-knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
-    knetmlpreg,
-    :loss_vs_iteration;
-    window = 50,
-    sampleevery = 10,
-    )
-PredictMD.open_plot(knet_learningcurve_lossvsiteration)
-
-# Plot learning curve: loss vs. iteration, skip the first 100 iterations
-knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
-    knetmlpreg,
-    :loss_vs_iteration;
-    window = 50,
-    sampleevery = 10,
-    startat = 100,
-    endat = :end,
-    )
-PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
-
-# Plot true values versus predicted values for multilayer perceptron on training set
-knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    knetmlpreg,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    )
-PredictMD.open_plot(knetmlpreg_plot_training)
-
-# Plot true values versus predicted values for multilayer perceptron on testing set
-knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    knetmlpreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-PredictMD.open_plot(knetmlpreg_plot_testing)
-
-# Evaluate performance of multilayer perceptron on training set
-PredictMD.singlelabelregressionmetrics(
-    knetmlpreg,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    )
-
-# Evaluate performance of multilayer perceptron on testing set
-PredictMD.singlelabelregressionmetrics(
-    knetmlpreg,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    )
-
-##############################################################################
-##############################################################################
-### Section 4: Compare performance of all models #############################
-##############################################################################
-##############################################################################
-
-all_models = PredictMD.Fittable[
-    linearreg,
-    randomforestreg,
-    knetmlpreg,
-    ]
-
-# Compare performance of all three models on training set
-showall(PredictMD.singlelabelregressionmetrics(
-    all_models,
-    training_features_df,
-    training_labels_df,
-    labelname,
-    ))
-
-# Compare performance of all models on testing set
-showall(PredictMD.singlelabelregressionmetrics(
-    all_models,
-    testing_features_df,
-    testing_labels_df,
-    labelname,
-    ))
-
-##############################################################################
-##############################################################################
-### Section 5: Save trained models to file (if desired) ######################
-##############################################################################
-##############################################################################
-
-if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true"
-    PredictMD.save_model(linearreg_filename, linearreg)
-    PredictMD.save_model(randomforestreg_filename, randomforestreg)
-    PredictMD.save_model(knetmlpreg_filename, knetmlpreg)
-end
-
-##############################################################################
-##############################################################################
-## Appendix A: Directly access the output of regression models ###############
-##############################################################################
-##############################################################################
-
-# We can use the PredictMD.predict() function to get the real-valued predictions
-# output by each of the regression models.
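# A minimal sketch of working with those predictions by hand, assuming the
# returned DataFrame stores the predictions in a column named after the label
# (an assumption for illustration, not documented behavior):
predictions_df = PredictMD.predict(linearreg, testing_features_df)
ytrue = testing_labels_df[labelname]
ypred = predictions_df[labelname]  # hypothetical column name
rmse = sqrt(mean(abs2, ytrue .- ypred))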
- -# Get real-valued predictions from each model for training set -PredictMD.predict(linearreg,training_features_df,) -PredictMD.predict(randomforestreg,training_features_df,) -PredictMD.predict(knetmlpreg,training_features_df,) - -# Get real-valued predictions from each model for testing set -PredictMD.predict(linearreg,testing_features_df,) -PredictMD.predict(randomforestreg,testing_features_df,) -PredictMD.predict(knetmlpreg,testing_features_df,) - - -Base.Test.@test(isfile(ENV["linearreg_filename"])) -Base.Test.@test(isfile(ENV["randomforestreg_filename"])) -Base.Test.@test(isfile(ENV["knetmlpreg_filename"])) diff --git a/examples/boston_housing/compare_models.jl b/examples/boston_housing/compare_models.jl new file mode 100644 index 000000000..00d8669fb --- /dev/null +++ b/examples/boston_housing/compare_models.jl @@ -0,0 +1,83 @@ +srand(999) + +import PredictMD +import CSV +import DataFrames +import Knet + +trainingandvalidation_features_df_filename = + ENV["trainingandvalidation_features_df_filename"] +trainingandvalidation_labels_df_filename = + ENV["trainingandvalidation_labels_df_filename"] +testing_features_df_filename = + ENV["testing_features_df_filename"] +testing_labels_df_filename = + ENV["testing_labels_df_filename"] +training_features_df_filename = + ENV["training_features_df_filename"] +training_labels_df_filename = + ENV["training_labels_df_filename"] +validation_features_df_filename = + ENV["validation_features_df_filename"] +validation_labels_df_filename = + ENV["validation_labels_df_filename"] +trainingandvalidation_features_df = CSV.read( + trainingandvalidation_features_df_filename, + DataFrames.DataFrame, + ) +trainingandvalidation_labels_df = CSV.read( + trainingandvalidation_labels_df_filename, + DataFrames.DataFrame, + ) +testing_features_df = CSV.read( + testing_features_df_filename, + DataFrames.DataFrame, + ) +testing_labels_df = CSV.read( + testing_labels_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +validation_features_df = CSV.read( + validation_features_df_filename, + DataFrames.DataFrame, + ) +validation_labels_df = CSV.read( + validation_labels_df_filename, + DataFrames.DataFrame, + ) + +linear_regression_filename = ENV["linear_regression_filename"] +random_forest_regression_filename = ENV["random_forest_regression_filename"] +knetmlpreg_filename = ENV["knet_mlp_regression_filename"] + +linearreg = PredictMD.load_model(linearreg_filename) +randomforestreg = PredictMD.load_model(randomforestreg_filename) +knetmlpreg = PredictMD.load_model(knetmlpreg_filename) + +all_models = PredictMD.Fittable[ + linearreg, + randomforestreg, + knetmlpreg, + ] + +showall(PredictMD.singlelabelregressionmetrics( + all_models, + training_features_df, + training_labels_df, + labelname, + )) + +showall(PredictMD.singlelabelregressionmetrics( + all_models, + testing_features_df, + testing_labels_df, + labelname, + )) diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/get_model_output.jl new file mode 100644 index 000000000..cff3c996a --- /dev/null +++ b/examples/boston_housing/get_model_output.jl @@ -0,0 +1,71 @@ +srand(999) + +import PredictMD +import CSV +import DataFrames +import Knet + +trainingandvalidation_features_df_filename = + ENV["trainingandvalidation_features_df_filename"] +trainingandvalidation_labels_df_filename = + 
ENV["trainingandvalidation_labels_df_filename"] +testing_features_df_filename = + ENV["testing_features_df_filename"] +testing_labels_df_filename = + ENV["testing_labels_df_filename"] +training_features_df_filename = + ENV["training_features_df_filename"] +training_labels_df_filename = + ENV["training_labels_df_filename"] +validation_features_df_filename = + ENV["validation_features_df_filename"] +validation_labels_df_filename = + ENV["validation_labels_df_filename"] +trainingandvalidation_features_df = CSV.read( + trainingandvalidation_features_df_filename, + DataFrames.DataFrame, + ) +trainingandvalidation_labels_df = CSV.read( + trainingandvalidation_labels_df_filename, + DataFrames.DataFrame, + ) +testing_features_df = CSV.read( + testing_features_df_filename, + DataFrames.DataFrame, + ) +testing_labels_df = CSV.read( + testing_labels_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +validation_features_df = CSV.read( + validation_features_df_filename, + DataFrames.DataFrame, + ) +validation_labels_df = CSV.read( + validation_labels_df_filename, + DataFrames.DataFrame, + ) + +linear_regression_filename = ENV["linear_regression_filename"] +random_forest_regression_filename = ENV["random_forest_regression_filename"] +knetmlpreg_filename = ENV["knet_mlp_regression_filename"] + +linearreg = PredictMD.load_model(linearreg_filename) +randomforestreg = PredictMD.load_model(randomforestreg_filename) +knetmlpreg = PredictMD.load_model(knetmlpreg_filename) + +PredictMD.predict(linearreg,training_features_df,) +PredictMD.predict(randomforestreg,training_features_df,) +PredictMD.predict(knetmlpreg,training_features_df,) + +PredictMD.predict(linearreg,testing_features_df,) +PredictMD.predict(randomforestreg,testing_features_df,) +PredictMD.predict(knetmlpreg,testing_features_df,) diff --git a/examples/boston_housing/knet_mlp_regression.jl b/examples/boston_housing/knet_mlp_regression.jl new file mode 100644 index 000000000..3b3faccf6 --- /dev/null +++ b/examples/boston_housing/knet_mlp_regression.jl @@ -0,0 +1,215 @@ +srand(999) + +import PredictMD +import CSV +import DataFrames +import Knet + +trainingandvalidation_features_df_filename = + ENV["trainingandvalidation_features_df_filename"] +trainingandvalidation_labels_df_filename = + ENV["trainingandvalidation_labels_df_filename"] +testing_features_df_filename = + ENV["testing_features_df_filename"] +testing_labels_df_filename = + ENV["testing_labels_df_filename"] +training_features_df_filename = + ENV["training_features_df_filename"] +training_labels_df_filename = + ENV["training_labels_df_filename"] +validation_features_df_filename = + ENV["validation_features_df_filename"] +validation_labels_df_filename = + ENV["validation_labels_df_filename"] +trainingandvalidation_features_df = CSV.read( + trainingandvalidation_features_df_filename, + DataFrames.DataFrame, + ) +trainingandvalidation_labels_df = CSV.read( + trainingandvalidation_labels_df_filename, + DataFrames.DataFrame, + ) +testing_features_df = CSV.read( + testing_features_df_filename, + DataFrames.DataFrame, + ) +testing_labels_df = CSV.read( + testing_labels_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +validation_features_df 
= CSV.read( + validation_features_df_filename, + DataFrames.DataFrame, + ) +validation_labels_df = CSV.read( + validation_labels_df_filename, + DataFrames.DataFrame, + ) + +ENV["knet_mlp_regression_filename"] = string( + tempname(), + "_knet_mlp_regression.jld2", + ) +Base.Test.@test(!isfile(ENV["knet_mlp_regression_filename"])) +knet_mlp_regression = ENV["knet_mlp_regression_filename"] + +function knetmlp_predict( + w, # don't put a type annotation on this + x0::AbstractArray, + ) + # x0 = input layer + # x1 = hidden layer + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases + # x2 = output layer + x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + return x2 +end + +function knetmlp_loss( + predict_function::Function, + modelweights, # don't put a type annotation on this + x::AbstractArray, + ytrue::AbstractArray; + L1::Real = Cfloat(0), + L2::Real = Cfloat(0), + ) + loss = mean( + abs2, + ytrue - predict_function( + modelweights, + x, + ), + ) + if L1 != 0 + loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) + end + if L2 != 0 + loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) + end + return loss +end + +knetmlp_modelweights = Any[ + # input layer has dimension contrasts.num_array_columns + # + # hidden layer (10 neurons): + Cfloat.( + 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights + ), + Cfloat.( + zeros(Cfloat,10,1) # biases + ), + # + # output layer (regression nets have exactly 1 neuron in output layer): + Cfloat.( + 0.1f0*randn(Cfloat,1,10) # weights + ), + Cfloat.( + zeros(Cfloat,1,1) # biases + ), + ] + +knetmlp_losshyperparameters = Dict() +knetmlp_losshyperparameters[:L1] = Cfloat(0.0) +knetmlp_losshyperparameters[:L2] = Cfloat(0.0) +knetmlp_optimizationalgorithm = :Adam +knetmlp_optimizerhyperparameters = Dict() +knetmlp_minibatchsize = 48 +knetmlp_maxepochs = 1_000 +knetmlpreg = PredictMD.singlelabeldataframeknetregression( + featurenames, + labelname; + package = :Knetjl, + name = "Knet MLP", + predict = knetmlp_predict, + loss = knetmlp_loss, + losshyperparameters = knetmlp_losshyperparameters, + optimizationalgorithm = knetmlp_optimizationalgorithm, + optimizerhyperparameters = knetmlp_optimizerhyperparameters, + minibatchsize = knetmlp_minibatchsize, + modelweights = knetmlp_modelweights, + maxepochs = knetmlp_maxepochs, + printlosseverynepochs = 100, # if 0, will not print at all + feature_contrasts = feature_contrasts, + ) + +PredictMD.fit!( + knetmlpreg, + training_features_df, + training_labels_df, + validation_features_df, + validation_labels_df, + ) + + +knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_epoch; + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch) + +knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_epoch; + startat = 10, + endat = :end, + ) +PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) + +knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_iteration; + window = 50, + sampleevery = 10, + ) +PredictMD.open_plot(knet_learningcurve_lossvsiteration) + +knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( + knetmlpreg, + :loss_vs_iteration; + window = 50, + sampleevery = 10, + startat = 100, + endat = :end, + ) +PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) + +knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted( + knetmlpreg, + training_features_df, + 
training_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_training)
+
+knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(knetmlpreg_plot_testing)
+
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+
+PredictMD.singlelabelregressionmetrics(
+    knetmlpreg,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+
+PredictMD.save_model(knet_mlp_regression, knetmlpreg)
diff --git a/examples/boston_housing/linear_regression.jl b/examples/boston_housing/linear_regression.jl
new file mode 100644
index 000000000..c19c18fa4
--- /dev/null
+++ b/examples/boston_housing/linear_regression.jl
@@ -0,0 +1,106 @@
+srand(999)
+
+import PredictMD
+import CSV
+import DataFrames
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["linear_regression_filename"] = string(
+    tempname(),
+    "_linear_regression.jld2",
+    )
+Base.Test.@test(!isfile(ENV["linear_regression_filename"]))
+linear_regression_filename = ENV["linear_regression_filename"]
+
+linear_regression = PredictMD.singlelabeldataframelinearregression(
+    featurenames,
+    labelname;
+    package = :GLMjl,
+    intercept = true, # optional, defaults to true
+    interactions = 2, # optional, defaults to 1
+    name = "Linear regression", # optional
+    )
+
+PredictMD.fit!(linear_regression,training_features_df,training_labels_df,)
+
+PredictMD.get_underlying(linear_regression)
+
+linear_regression_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    linear_regression,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(linear_regression_plot_training)
+
+linear_regression_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    linear_regression,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(linear_regression_plot_testing)
+
+PredictMD.singlelabelregressionmetrics(
+    linear_regression,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+
+PredictMD.singlelabelregressionmetrics(
+    linear_regression,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+
+PredictMD.save_model(linear_regression_filename, linear_regression)
diff --git a/examples/boston_housing/preprocess_data.jl b/examples/boston_housing/preprocess_data.jl
new file mode 100644
index 000000000..4aedf1adb
--- /dev/null
+++ b/examples/boston_housing/preprocess_data.jl
@@ -0,0 +1,148 @@
+srand(999)
+
+import PredictMD
+import CSV
+import DataFrames
+import GZip
+import StatsBase
+
+df = CSV.read(
+    GZip.gzopen(
+        joinpath(
+            Pkg.dir("RDatasets"),
+            "data",
+            "MASS",
+            "Boston.csv.gz",
+            ),
+        ),
+    DataFrames.DataFrame,
+    )
+
+DataFrames.dropmissing!(df)
+
+PredictMD.shuffle_rows!(df)
+
+categoricalfeaturenames = Symbol[]
+continuousfeaturenames = Symbol[
+    :Crim,
+    :Zn,
+    :Indus,
+    :Chas,
+    :NOx,
+    :Rm,
+    :Age,
+    :Dis,
+    :Rad,
+    :Tax,
+    :PTRatio,
+    :Black,
+    :LStat,
+    ]
+featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
+
+feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames)
+
+singlelabelname = :MedV
+labelnames = [singlelabelname]
+
+features_df = df[featurenames]
+labels_df = df[labelnames]
+
+DataFrames.describe(labels_df[singlelabelname])
+
+trainingandvalidation_features_df,
+    trainingandvalidation_labels_df,
+    testing_features_df,
+    testing_labels_df = PredictMD.split_data(
+        features_df,
+        labels_df,
+        0.75, # 75% training/validation, 25% testing
+        )
+training_features_df,
+    training_labels_df,
+    validation_features_df,
+    validation_labels_df = PredictMD.split_data(
+        trainingandvalidation_features_df,
+        trainingandvalidation_labels_df,
+        2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation
+        )
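[Editor's note: a quick sanity check one could run right after the two
split_data calls above. This is only a sketch, assuming the documented
50%/25%/25% proportions; it uses nothing beyond DataFrames row counts.]

    n_total = size(features_df, 1)
    println(size(training_features_df, 1) / n_total)    # roughly 0.50
    println(size(validation_features_df, 1) / n_total)  # roughly 0.25
    println(size(testing_features_df, 1) / n_total)     # roughly 0.25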
+ENV["trainingandvalidation_features_df_filename"] = string(
+    tempname(),
+    "_trainingandvalidation_features_df.csv",
+    )
+ENV["trainingandvalidation_labels_df_filename"] = string(
+    tempname(),
+    "_trainingandvalidation_labels_df.csv",
+    )
+ENV["testing_features_df_filename"] = string(
+    tempname(),
+    "_testing_features_df.csv",
+    )
+ENV["testing_labels_df_filename"] = string(
+    tempname(),
+    "_testing_labels_df.csv",
+    )
+ENV["training_features_df_filename"] = string(
+    tempname(),
+    "_training_features_df.csv",
+    )
+ENV["training_labels_df_filename"] = string(
+    tempname(),
+    "_training_labels_df.csv",
+    )
+ENV["validation_features_df_filename"] = string(
+    tempname(),
+    "_validation_features_df.csv",
+    )
+ENV["validation_labels_df_filename"] = string(
+    tempname(),
+    "_validation_labels_df.csv",
+    )
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+CSV.write(
+    trainingandvalidation_features_df_filename,
+    trainingandvalidation_features_df,
+    )
+CSV.write(
+    trainingandvalidation_labels_df_filename,
+    trainingandvalidation_labels_df,
+    )
+CSV.write(
+    testing_features_df_filename,
+    testing_features_df,
+    )
+CSV.write(
+    testing_labels_df_filename,
+    testing_labels_df,
+    )
+CSV.write(
+    training_features_df_filename,
+    training_features_df,
+    )
+CSV.write(
+    training_labels_df_filename,
+    training_labels_df,
+    )
+CSV.write(
+    validation_features_df_filename,
+    validation_features_df,
+    )
+CSV.write(
+    validation_labels_df_filename,
+    validation_labels_df,
+    )
diff --git a/examples/boston_housing/random_forest_regression.jl b/examples/boston_housing/random_forest_regression.jl
new file mode 100644
index 000000000..f2a2f3c77
--- /dev/null
+++ b/examples/boston_housing/random_forest_regression.jl
@@ -0,0 +1,105 @@
+srand(999)
+
+import PredictMD
+import CSV
+import DataFrames
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["random_forest_regression_filename"] = string(
+    tempname(),
+    "_random_forest_regression.jld2",
+    )
+Base.Test.@test(!isfile(ENV["random_forest_regression_filename"]))
+random_forest_regression_filename = ENV["random_forest_regression_filename"]
+
+random_forest_regression = PredictMD.singlelabeldataframerandomforestregression(
+    featurenames,
+    labelname;
+    nsubfeatures = 2, # number of subfeatures; defaults to 2
+    ntrees = 20, # number of trees; defaults to 10
+    package = :DecisionTreejl,
+    name = "Random forest", # optional
+    feature_contrasts = feature_contrasts,
+    )
+
+PredictMD.fit!(random_forest_regression,training_features_df,training_labels_df,)
+
+random_forest_regression_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    random_forest_regression,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(random_forest_regression_plot_training)
+
+random_forest_regression_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    random_forest_regression,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+PredictMD.open_plot(random_forest_regression_plot_testing)
+
+PredictMD.singlelabelregressionmetrics(
+    random_forest_regression,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    )
+
+PredictMD.singlelabelregressionmetrics(
+    random_forest_regression,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    )
+
+PredictMD.save_model(random_forest_regression_filename, random_forest_regression)

From 7e22ec6aad8128c73a13cc1cda966b0e406fa8ca Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 23:27:25 -0400
Subject: [PATCH 34/62] Progress commit

---
 examples/boston_housing/OLD_boston_housing.jl | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/boston_housing/OLD_boston_housing.jl

diff --git a/examples/boston_housing/OLD_boston_housing.jl b/examples/boston_housing/OLD_boston_housing.jl
deleted file mode 100644
index e69de29bb..000000000

From 0bfc5132c8458555da732d9bbffa6be6bdec4d46 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 23:28:45 -0400
Subject: [PATCH 35/62] Progress commit

---
 examples/boston_housing/compare_models.jl   |  8 +++
 examples/boston_housing/get_model_output.jl | 71 ---------------------
 2 files changed, 8 insertions(+), 71 deletions(-)

diff --git a/examples/boston_housing/compare_models.jl b/examples/boston_housing/compare_models.jl
index 00d8669fb..cd4b6a05e 100644
--- a/examples/boston_housing/compare_models.jl
+++ b/examples/boston_housing/compare_models.jl
@@ -81,3 +81,11 @@ showall(PredictMD.singlelabelregressionmetrics(
     testing_labels_df,
     labelname,
     ))
+
+PredictMD.predict(linearreg,training_features_df,)
+PredictMD.predict(randomforestreg,training_features_df,)
+PredictMD.predict(knetmlpreg,training_features_df,)
+
+PredictMD.predict(linearreg,testing_features_df,)
+PredictMD.predict(randomforestreg,testing_features_df,)
+PredictMD.predict(knetmlpreg,testing_features_df,)
diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/get_model_output.jl
index cff3c996a..e69de29bb 100644
--- a/examples/boston_housing/get_model_output.jl
+++ b/examples/boston_housing/get_model_output.jl
@@ -1,71 +0,0 @@
-srand(999)
-
-import PredictMD
-import CSV
-import DataFrames
-import Knet
-
-trainingandvalidation_features_df_filename =
-    ENV["trainingandvalidation_features_df_filename"]
-trainingandvalidation_labels_df_filename =
-    ENV["trainingandvalidation_labels_df_filename"]
-testing_features_df_filename =
-    ENV["testing_features_df_filename"]
-testing_labels_df_filename =
-    ENV["testing_labels_df_filename"]
-training_features_df_filename =
-    ENV["training_features_df_filename"]
-training_labels_df_filename =
-    ENV["training_labels_df_filename"]
-validation_features_df_filename =
-    ENV["validation_features_df_filename"]
-validation_labels_df_filename =
-    ENV["validation_labels_df_filename"]
-trainingandvalidation_features_df = CSV.read(
-    trainingandvalidation_features_df_filename,
-    DataFrames.DataFrame,
-    )
-trainingandvalidation_labels_df = CSV.read(
-    trainingandvalidation_labels_df_filename,
-    DataFrames.DataFrame,
-    )
-testing_features_df = CSV.read(
-    testing_features_df_filename,
-    DataFrames.DataFrame,
-    )
-testing_labels_df = CSV.read(
-    testing_labels_df_filename,
-    DataFrames.DataFrame,
-    )
-training_features_df = CSV.read(
-    training_features_df_filename,
-    DataFrames.DataFrame,
-    )
-training_labels_df = CSV.read(
-    training_labels_df_filename,
-    DataFrames.DataFrame,
-    )
-validation_features_df = CSV.read(
-    validation_features_df_filename,
-    DataFrames.DataFrame,
-    )
-validation_labels_df = CSV.read(
-    validation_labels_df_filename,
-    DataFrames.DataFrame,
-    )
-
-linear_regression_filename = ENV["linear_regression_filename"]
-random_forest_regression_filename = ENV["random_forest_regression_filename"]
-knetmlpreg_filename = ENV["knet_mlp_regression_filename"]
-
-linearreg = PredictMD.load_model(linear_regression_filename)
-randomforestreg = PredictMD.load_model(random_forest_regression_filename)
-knetmlpreg = PredictMD.load_model(knetmlpreg_filename)
-
-PredictMD.predict(linearreg,training_features_df,)
-PredictMD.predict(randomforestreg,training_features_df,)
-PredictMD.predict(knetmlpreg,training_features_df,)
-
-PredictMD.predict(linearreg,testing_features_df,)
-PredictMD.predict(randomforestreg,testing_features_df,)
-PredictMD.predict(knetmlpreg,testing_features_df,)
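[Editor's note: the example scripts above hand trained models between
processes via the PredictMD.save_model / PredictMD.load_model pair. A minimal
round-trip sketch, where the temp path and `some_model` (an already-trained
PredictMD model) are hypothetical names:]

    model_filename = string(tempname(), "_model.jld2")
    PredictMD.save_model(model_filename, some_model)
    reloaded_model = PredictMD.load_model(model_filename)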
From 5dec37c51cc7c94253ded54fd53a716c3d80b86c Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 21 May 2018 23:29:03 -0400
Subject: [PATCH 36/62] Progress commit

---
 examples/boston_housing/get_model_output.jl | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/boston_housing/get_model_output.jl

diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/get_model_output.jl
deleted file mode 100644
index e69de29bb..000000000

From f878a3ba9de8d0453ffc6c40fdcb702bed059a95 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Tue, 22 May 2018 00:06:17 -0400
Subject: [PATCH 37/62] Major progress commit

---
 examples/boston_housing/compare_models.jl     |   65 +-
 examples/boston_housing/get_model_output.jl   |  108 ++
 .../boston_housing/knet_mlp_regression.jl     |   36 +-
 examples/boston_housing/linear_regression.jl  |    3 +-
 examples/boston_housing/preprocess_data.jl    |    5 +-
 .../random_forest_regression.jl               |    5 +-
 .../OLD_breast_cancer_biopsy.jl               | 1709 -----------------
 .../c_svc_svm_classifier.jl                   |  129 ++
 .../breast_cancer_biopsy/compare_models.jl    |  202 ++
 .../breast_cancer_biopsy/get_model_output.jl  |  134 ++
 .../knet_mlp_classifier.jl                    |  242 +++
 .../logistic_classifier.jl                    |  181 ++
 .../nu_svc_svm_classifier.jl                  |  129 ++
 .../breast_cancer_biopsy/preprocess_data.jl   |  135 ++
 .../random_forest_classifier.jl               |  129 ++
 examples/breast_cancer_biopsy/smote.jl        |   98 +
 16 files changed, 1563 insertions(+), 1747 deletions(-)
 create mode 100644 examples/boston_housing/get_model_output.jl
 create mode 100644 examples/breast_cancer_biopsy/c_svc_svm_classifier.jl
 create mode 100644 examples/breast_cancer_biopsy/compare_models.jl
 create mode 100644 examples/breast_cancer_biopsy/get_model_output.jl
 create mode 100644 examples/breast_cancer_biopsy/knet_mlp_classifier.jl
 create mode 100644 examples/breast_cancer_biopsy/logistic_classifier.jl
 create mode 100644 examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl
 create mode 100644 examples/breast_cancer_biopsy/preprocess_data.jl
 create mode 100644 examples/breast_cancer_biopsy/random_forest_classifier.jl
 create mode 100644 examples/breast_cancer_biopsy/smote.jl

diff --git a/examples/boston_housing/compare_models.jl b/examples/boston_housing/compare_models.jl
index 00d8669fb..0a1ec1519 100644
--- a/examples/boston_housing/compare_models.jl
+++ b/examples/boston_housing/compare_models.jl
@@ -1,9 +1,9 @@
 srand(999)
 
-import PredictMD
 import CSV
 import DataFrames
 import Knet
+import PredictMD
 
 trainingandvalidation_features_df_filename =
     ENV["trainingandvalidation_features_df_filename"]
@@ -56,16 +56,53 @@ validation_labels_df = CSV.read(
 
 linear_regression_filename = ENV["linear_regression_filename"]
 random_forest_regression_filename = ENV["random_forest_regression_filename"]
-knetmlpreg_filename = ENV["knet_mlp_regression_filename"]
+knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"]
+
+linear_regression = PredictMD.load_model(linear_regression_filename)
+random_forest_reg = PredictMD.load_model(random_forest_regression_filename)
+
+function knetmlp_predict(
+    w, # don't put a type annotation on this
+    x0::AbstractArray,
+    )
+    # x0 = input layer
+    # x1 = hidden layer
+    x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases
+    # x2 = output layer
+    x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases
+    return x2
+end
+
+function knetmlp_loss(
+    predict_function::Function,
+    modelweights, # don't put a type annotation on this
+    x::AbstractArray,
+    ytrue::AbstractArray;
+    L1::Real = Cfloat(0),
+    L2::Real = Cfloat(0),
+    )
+    loss = mean(
+        abs2,
+        ytrue - predict_function(
+            modelweights,
+            x,
+            ),
+        )
+    if L1 != 0
+        loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])
+    end
+    if L2 != 0
+        loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])
+    end
+    return loss
+end
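[Editor's note: a quick, self-contained shape check for the knetmlp_predict
forward pass defined just above. The 13 input columns match the Boston housing
feature list and the 10 hidden units match the weight shapes used in
knet_mlp_regression.jl; all `_demo` names are hypothetical:]

    import Knet
    w_demo = Any[
        0.1f0*randn(Cfloat, 10, 13), # hidden-layer weights
        zeros(Cfloat, 10, 1),        # hidden-layer biases
        0.1f0*randn(Cfloat, 1, 10),  # output-layer weights
        zeros(Cfloat, 1, 1),         # output-layer biases
        ]
    x_demo = randn(Cfloat, 13, 5)            # 5 samples, one per column
    y_demo = knetmlp_predict(w_demo, x_demo) # 1x5 array of predicted values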
-linearreg = PredictMD.load_model(linear_regression_filename)
-randomforestreg = PredictMD.load_model(random_forest_regression_filename)
-knetmlpreg = PredictMD.load_model(knetmlpreg_filename)
+knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename)
 
 all_models = PredictMD.Fittable[
-    linearreg,
-    randomforestreg,
-    knetmlpreg,
+    linear_regression,
+    random_forest_reg,
+    knet_mlp_regression,
     ]
 
 showall(PredictMD.singlelabelregressionmetrics(
@@ -82,10 +119,10 @@ showall(PredictMD.singlelabelregressionmetrics(
     labelname,
     ))
 
-PredictMD.predict(linearreg,training_features_df,)
-PredictMD.predict(randomforestreg,training_features_df,)
-PredictMD.predict(knetmlpreg,training_features_df,)
+PredictMD.predict(linear_regression,training_features_df,)
+PredictMD.predict(random_forest_reg,training_features_df,)
+PredictMD.predict(knet_mlp_regression,training_features_df,)
 
-PredictMD.predict(linearreg,testing_features_df,)
-PredictMD.predict(randomforestreg,testing_features_df,)
-PredictMD.predict(knetmlpreg,testing_features_df,)
+PredictMD.predict(linear_regression,testing_features_df,)
+PredictMD.predict(random_forest_reg,testing_features_df,)
+PredictMD.predict(knet_mlp_regression,testing_features_df,)
diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/get_model_output.jl
new file mode 100644
index 000000000..f86c3aa9f
--- /dev/null
+++ b/examples/boston_housing/get_model_output.jl
@@ -0,0 +1,108 @@
+srand(999)
+
+import CSV
+import DataFrames
+import Knet
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+linear_regression_filename = ENV["linear_regression_filename"]
+random_forest_regression_filename = ENV["random_forest_regression_filename"]
+knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"]
+
+linear_regression = PredictMD.load_model(linear_regression_filename)
+random_forest_reg = PredictMD.load_model(random_forest_regression_filename)
+
+function knetmlp_predict(
+    w, # don't put a type annotation on this
+    x0::AbstractArray,
+    )
+    # x0 = input layer
+    # x1 = hidden layer
+    x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases
+    # x2 = output layer
+    x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases
+    return x2
+end
+
+function knetmlp_loss(
+    predict_function::Function,
+    modelweights, # don't put a type annotation on this
+    x::AbstractArray,
+    ytrue::AbstractArray;
+    L1::Real = Cfloat(0),
+    L2::Real = Cfloat(0),
+    )
+    loss = mean(
+        abs2,
+        ytrue - predict_function(
+            modelweights,
+            x,
+            ),
+        )
+    if L1 != 0
+        loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])
+    end
+    if L2 != 0
+        loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])
+    end
+    return loss
+end
+
+knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename)
+
+PredictMD.predict(linear_regression,training_features_df,)
+PredictMD.predict(random_forest_reg,training_features_df,)
+PredictMD.predict(knet_mlp_regression,training_features_df,)
+
+PredictMD.predict(linear_regression,testing_features_df,)
+PredictMD.predict(random_forest_reg,testing_features_df,)
+PredictMD.predict(knet_mlp_regression,testing_features_df,)
diff --git a/examples/boston_housing/knet_mlp_regression.jl b/examples/boston_housing/knet_mlp_regression.jl
index 3b3faccf6..277cdf1eb 100644
--- a/examples/boston_housing/knet_mlp_regression.jl
+++ b/examples/boston_housing/knet_mlp_regression.jl
@@ -1,9 +1,9 @@
 srand(999)
 
-import PredictMD
 import CSV
 import DataFrames
 import Knet
+import PredictMD
 
 trainingandvalidation_features_df_filename =
     ENV["trainingandvalidation_features_df_filename"]
@@ -58,7 +58,6 @@ ENV["knet_mlp_regression_filename"] = string(
     tempname(),
     "_knet_mlp_regression.jld2",
     )
-Base.Test.@test(!isfile(ENV["knet_mlp_regression_filename"]))
 knet_mlp_regression = ENV["knet_mlp_regression_filename"]
 
 function knetmlp_predict(
@@ -124,7 +123,10 @@ knetmlp_optimizationalgorithm = :Adam
 knetmlp_optimizerhyperparameters = Dict()
 knetmlp_minibatchsize = 48
 knetmlp_maxepochs = 1_000
-knetmlpreg = PredictMD.singlelabeldataframeknetregression(
+
+feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames)
+
+knet_mlp_regression = PredictMD.singlelabeldataframeknetregression(
     featurenames,
     labelname;
     package = :Knetjl,
@@ -142,7 +144,7 @@ knet_mlp_regression = PredictMD.singlelabeldataframeknetregression(
     )
 
 PredictMD.fit!(
-    knetmlpreg,
+    knet_mlp_regression,
     training_features_df,
     training_labels_df,
     validation_features_df,
     validation_labels_df,
@@ -151,13 +153,13 @@ PredictMD.fit!(
 
 
 knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve(
-    knetmlpreg,
+    knet_mlp_regression,
     :loss_vs_epoch;
     )
 PredictMD.open_plot(knet_learningcurve_lossvsepoch)
 
 knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
-    knetmlpreg,
+    knet_mlp_regression,
     :loss_vs_epoch;
     startat = 10,
     endat = :end,
@@ -165,7 +167,7 @@ knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
 PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs)
 
 knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
-    knetmlpreg,
+    knet_mlp_regression,
     :loss_vs_iteration;
     window = 50,
     sampleevery = 10,
@@ -173,7 +175,7 @@ knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
 PredictMD.open_plot(knet_learningcurve_lossvsiteration)
 
 knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
-    knetmlpreg,
+    knet_mlp_regression,
     :loss_vs_iteration;
     window = 50,
     sampleevery = 10,
@@ -182,34 +184,34 @@ knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcur
     )
 PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
 
-knetmlpreg_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    knetmlpreg,
+knet_mlp_regression_plot_training = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knet_mlp_regression,
     training_features_df,
     training_labels_df,
     labelname,
     )
-PredictMD.open_plot(knetmlpreg_plot_training)
+PredictMD.open_plot(knet_mlp_regression_plot_training)
 
-knetmlpreg_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
-    knetmlpreg,
+knet_mlp_regression_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspredicted(
+    knet_mlp_regression,
     testing_features_df,
     testing_labels_df,
     labelname,
     )
-PredictMD.open_plot(knetmlpreg_plot_testing)
+PredictMD.open_plot(knet_mlp_regression_plot_testing)
 
 PredictMD.singlelabelregressionmetrics(
-    knetmlpreg,
+    knet_mlp_regression,
     training_features_df,
     training_labels_df,
     labelname,
     )
 
 PredictMD.singlelabelregressionmetrics(
-    knetmlpreg,
+    knet_mlp_regression,
     testing_features_df,
     testing_labels_df,
     labelname,
     )
 
-PredictMD.save_model(knet_mlp_regression, knetmlpreg)
+PredictMD.save_model(ENV["knet_mlp_regression_filename"], knet_mlp_regression)
diff --git a/examples/boston_housing/linear_regression.jl b/examples/boston_housing/linear_regression.jl
index c19c18fa4..a47ed1425 100644
--- a/examples/boston_housing/linear_regression.jl
+++ b/examples/boston_housing/linear_regression.jl
@@ -1,8 +1,8 @@
 srand(999)
 
-import PredictMD
 import CSV
 import DataFrames
+import PredictMD
 
 trainingandvalidation_features_df_filename =
     ENV["trainingandvalidation_features_df_filename"]
@@ -57,7 +57,6 @@ ENV["linear_regression_filename"] = string(
     tempname(),
     "_linear_regression.jld2",
     )
-Base.Test.@test(!isfile(ENV["linear_regression_filename"]))
 linear_regression_filename = ENV["linear_regression_filename"]
diff --git a/examples/boston_housing/preprocess_data.jl b/examples/boston_housing/preprocess_data.jl
index 4aedf1adb..8d1406ca6 100644
--- a/examples/boston_housing/preprocess_data.jl
+++ b/examples/boston_housing/preprocess_data.jl
@@ -1,9 +1,9 @@
 srand(999)
 
-import PredictMD
 import CSV
 import DataFrames
 import GZip
+import PredictMD
 import StatsBase
 
 df = CSV.read(
@@ -40,8 +40,6 @@ continuousfeaturenames = Symbol[
     ]
 featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
 
-feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames)
-
 singlelabelname = :MedV
 labelnames = [singlelabelname]
 
@@ -66,6 +64,7 @@ training_features_df,
     trainingandvalidation_labels_df,
     2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation
     )
+
 ENV["trainingandvalidation_features_df_filename"] = string(
     tempname(),
     "_trainingandvalidation_features_df.csv",
diff --git a/examples/boston_housing/random_forest_regression.jl b/examples/boston_housing/random_forest_regression.jl
index f2a2f3c77..41ad9a7e5 100644
--- a/examples/boston_housing/random_forest_regression.jl
+++ b/examples/boston_housing/random_forest_regression.jl
@@ -1,8 +1,8 @@
 srand(999)
 
-import PredictMD
 import CSV
 import DataFrames
+import PredictMD
 
 trainingandvalidation_features_df_filename =
     ENV["trainingandvalidation_features_df_filename"]
@@ -57,9 +57,10 @@ ENV["random_forest_regression_filename"] = string(
     tempname(),
     "_random_forest_regression.jld2",
     )
-Base.Test.@test(!isfile(ENV["random_forest_regression_filename"]))
 random_forest_regression_filename = ENV["random_forest_regression_filename"]
 
+feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames)
+
 random_forest_regression = PredictMD.singlelabeldataframerandomforestregression(
     featurenames,
     labelname;
diff --git a/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl
index e02da4cd7..e69de29bb 100644
--- a/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl
+++ b/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl
@@ -1,1709 +0,0 @@
-ENV["logisticclassifier_filename"] = string(tempname(), "_logisticclassifier.jld2")
-ENV["rfclassifier_filename"] = string(tempname(), "_rfclassifier.jld2")
-ENV["csvc_svmclassifier_filename"] = string(tempname(), "_csvc_svmclassifier.jld2")
-ENV["nusvc_svmclassifier_filename"] = string(tempname(), "_nusvc_svmclassifier.jld2")
-ENV["knetmlp_filename"] = string(tempname(), "_knetmlpclassifier.jld2")
-
-Base.Test.@test(!isfile(ENV["logisticclassifier_filename"]))
-Base.Test.@test(!isfile(ENV["rfclassifier_filename"]))
-Base.Test.@test(!isfile(ENV["csvc_svmclassifier_filename"]))
-Base.Test.@test(!isfile(ENV["nusvc_svmclassifier_filename"]))
-Base.Test.@test(!isfile(ENV["knetmlp_filename"]))
-
-ENV["LOADTRAINEDMODELSFROMFILE"] = "false"
-ENV["SAVETRAINEDMODELSTOFILE"] = "true"
-
-logisticclassifier_filename = ENV["logisticclassifier_filename"]
-rfclassifier_filename = ENV["rfclassifier_filename"]
-csvc_svmclassifier_filename = ENV["csvc_svmclassifier_filename"]
-nusvc_svmclassifier_filename = ENV["nusvc_svmclassifier_filename"]
-knetmlp_filename = ENV["knetmlp_filename"]
-
-##############################################################################
-##############################################################################
-### Section 1: Setup #########################################################
-##############################################################################
-##############################################################################
-
-# import required packages
-import PredictMD
-import DataFrames
-import Knet
-import LIBSVM
-import RDatasets
-import StatsBase
-
-# set the seed of the global random number generator
-# this makes the results reproducible
-srand(999)
-
-##############################################################################
-##############################################################################
-### Section 2: Prepare data ##################################################
-##############################################################################
-##############################################################################
-
-# Import breast cancer biopsy data
-df = RDatasets.dataset("MASS", "biopsy")
-
-# Remove rows with missing data
-DataFrames.dropmissing!(df)
-
-# Shuffle rows
-PredictMD.shuffle_rows!(df)
-
-# Define features
-categoricalfeaturenames = Symbol[]
-continuousfeaturenames = Symbol[
-    :V1,
-    :V2,
-    :V3,
-    :V4,
-    :V5,
-    :V6,
-    :V7,
-    :V8,
-    :V9,
-    ]
-featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
-
-if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true"
-else
-
feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :Class -negativeclass = "benign" -positiveclass = "malignant" -labellevels = [negativeclass, positiveclass] - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Apply the SMOTE algorithm to the training set ################# -############################################################################## -############################################################################## - -# Examine prevalence of each class in training set -# DataFrames.describe(training_labels_df[labelname]) -StatsBase.countmap(training_labels_df[labelname]) - -# We see that malignant is minority class and benign is majority class. -# The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending -# on random seed). We would like that ratio to be 1:1. We will use SMOTE -# to generate synthetic minority class samples. We will also undersample the -# minority class. The result will be a balanced training set. -majorityclass = "benign" -minorityclass = "malignant" - -smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( - training_features_df, - training_labels_df, - featurenames, - labelname; - majorityclass = majorityclass, - minorityclass = minorityclass, - pct_over = 100, # how much to oversample the minority class - minority_to_majority_ratio = 1.0, # desired minority:majority ratio - k = 5, - ) - -# Examine prevalence of each class in smoted training set -# DataFrames.describe(smoted_training_labels_df[labelname]) -StatsBase.countmap(smoted_training_labels_df[labelname]) - -# Now we have a ratio of malignant:benign that is 1:1. 
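[Editor's note: an illustrative way to turn the countmap results above into
the minority:majority ratio being discussed; the `counts` name is hypothetical
and the exact value depends on the random seed:]

    counts = StatsBase.countmap(smoted_training_labels_df[labelname])
    ratio = counts[minorityclass] / counts[majorityclass] # roughly 1.0 after SMOTE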
- -############################################################################## -############################################################################## -### Section 4: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Logistic "regression" classifier ########################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - logisticclassifier = PredictMD.load_model(logisticclassifier_filename) -else - # Set up logistic classifier model - logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( - featurenames, - labelname, - labellevels; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 1, # optional, defaults to 1 - name = "Logistic regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train logistic classifier model on smoted training set - PredictMD.fit!( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# View coefficients, p values, etc. for underlying logistic regression -PredictMD.get_underlying(logisticclassifier) - -# Plot classifier histogram for logistic classifier on smoted training set -logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_training) - -# Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_testing) - -# Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - window = 0.2, - ) -PredictMD.open_plot(logistic_calibration_curve) - -PredictMD.probability_calibration_metrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) - -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - average_function = mean, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - 
testing_labels_df, - labelname, - positiveclass; - average_function = median, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) - -############################################################################## -## Random forest classifier ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - rfclassifier = PredictMD.load_model(rfclassifier_filename) -else - # Set up random forest classifier model - rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( - featurenames, - labelname, - labellevels; - nsubfeatures = 4, # number of subfeatures; defaults to 2 - ntrees = 200, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest classifier model on smoted training set - PredictMD.fit!( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for random forest classifier on smoted training set -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_training) - -# Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_testing) - -# Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -rf_calibration_curve = PredictMD.plot_probability_calibration_curve( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) -PredictMD.open_plot(rf_calibration_curve) - -############################################################################## -## Support vector machine (C support vector classifier) ###################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) -else - # Set up C-SVC model - csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.SVC, - name = "SVM (C-SVC)", - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train C-SVC model on smoted training set - PredictMD.fit!( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for C-SVC on 
smoted training set -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(csvc_svmclassifier_hist_training) - -# Plot classifier histogram for C-SVC on testing set -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(csvc_svmclassifier_hist_testing) - -# Evaluate performance of C-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of C-SVC on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Support vector machine (nu support vector classifier) ##################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename) -else - # Set up nu-SVC model - nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVC, - name = "SVM (nu-SVC)", - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train nu-SVC model on smoted training set - PredictMD.fit!( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for nu-SVC on smoted training set -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(nusvc_svmclassifier_hist_training) - -# Plot classifier histogram for nu-SVC on testing set -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(nusvc_svmclassifier_hist_testing) - -# Evaluate performance of nu-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of SVM on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Multilayer perceptron (i.e. 
fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - probabilities::Bool = true, - ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases - unnormalizedlogprobs = x3 - if probabilities - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - else - return unnormalizedlogprobs - end -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict( - modelweights, - x; - probabilities = false, - ), - ytrue, - 1, # d = 1 means that instances are in columns - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpclassifier = PredictMD.load_model(knetmlp_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # first hidden layer (64 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,64,1) # biases - ), - # - # second hidden layer (32 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,32,64) # weights - ), - Cfloat.( - zeros(Cfloat,32,1) # biases - ), - # - # output layer (number of neurons == number of classes): - Cfloat.( - 0.1f0*randn(Cfloat,2,32) # weights - ), - Cfloat.( - zeros(Cfloat,2,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Momentum - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. 
- knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( - featurenames, - labelname, - labellevels; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - printlosseverynepochs = 100, # if 0, will not print at all - maxepochs = knetmlp_maxepochs, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_training) - -# Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_testing) - -# Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -############################################################################## -## Section 5: Compare performance of all models ############################## -############################################################################## 
-############################################################################## - -all_models = PredictMD.Fittable[ - logisticclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ] - -# Compare performance of all models on smoted training set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Plot receiver operating characteristic curves for all models on testing set. -rocplottesting = PredictMD.plotroccurves( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open_plot(rocplottesting) - -# Plot precision-recall curves for all models on testing set. 
-prplottesting = PredictMD.plotprcurves( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open_plot(prplottesting) - -############################################################################## -############################################################################## -### Section 6: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save_model(logisticclassifier_filename, logisticclassifier) - PredictMD.save_model(rfclassifier_filename, rfclassifier) - PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier) - PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier) - PredictMD.save_model(knetmlp_filename, knetmlpclassifier) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of classification models ########### -############################################################################## -############################################################################## - -# We can use the PredictMD.predict_proba() function to get the probabilities output -# by each of the classification models. - -# Get probabilities from each model for smoted training set -PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,) -PredictMD.predict_proba(rfclassifier,smoted_training_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,) - -# Get probabilities from each model for testing set -PredictMD.predict_proba(logisticclassifier,testing_features_df,) -PredictMD.predict_proba(rfclassifier,testing_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) - -# If we want to get predicted classes instead of probabilities, we can use the -# PredictMD.predict() function to get the class predictions output by each of the -# classification models. For each sample, PredictMD.predict() will select the class -# with the highest probability. In the case of binary classification, this is -# equivalent to using a threshold of 0.5. 
- -# Get class predictions from each model for smoted training set -PredictMD.predict(logisticclassifier,smoted_training_features_df,) -PredictMD.predict(rfclassifier,smoted_training_features_df,) -PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) - -# Get class predictions from each model for testing set -PredictMD.predict(logisticclassifier,testing_features_df,) -PredictMD.predict(rfclassifier,testing_features_df,) -PredictMD.predict(csvc_svmclassifier,testing_features_df,) -PredictMD.predict(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict(knetmlpclassifier,testing_features_df,) - -Base.Test.@test(isfile(ENV["logisticclassifier_filename"])) -Base.Test.@test(isfile(ENV["rfclassifier_filename"])) -Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"])) -Base.Test.@test(isfile(ENV["knetmlp_filename"])) - -ENV["LOADTRAINEDMODELSFROMFILE"] = "true" -ENV["SAVETRAINEDMODELSTOFILE"] = "false" - -logisticclassifier_filename = ENV["logisticclassifier_filename"] -rfclassifier_filename = ENV["rfclassifier_filename"] -csvc_svmclassifier_filename = ENV["csvc_svmclassifier_filename"] -nusvc_svmclassifier_filename = ENV["nusvc_svmclassifier_filename"] -knetmlp_filename = ENV["knetmlp_filename"] - -############################################################################## -############################################################################## -### Section 1: Setup ######################################################### -############################################################################## -############################################################################## - -# import required packages -import PredictMD -import DataFrames -import Knet -import LIBSVM -import RDatasets -import StatsBase - -# set the seed of the global random number generator -# this makes the results reproducible -srand(999) - -############################################################################## -############################################################################## -### Section 2: Prepare data ################################################## -############################################################################## -############################################################################## - -# Import breast cancer biopsy data -df = RDatasets.dataset("MASS", "biopsy") - -# Remove rows with missing data -DataFrames.dropmissing!(df) - -# Shuffle rows -PredictMD.shuffle_rows!(df) - -# Define features -categoricalfeaturenames = Symbol[] -continuousfeaturenames = Symbol[ - :V1, - :V2, - :V3, - :V4, - :V5, - :V6, - :V7, - :V8, - :V9, - ] -featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - feature_contrasts = PredictMD.generate_feature_contrasts(df, featurenames) -end - -# Define labels -labelname = :Class -negativeclass = "benign" -positiveclass = "malignant" -labellevels = [negativeclass, positiveclass] - -# Put features and labels in separate dataframes -features_df = df[featurenames] -labels_df = df[[labelname]] - -# Split the data into training (50%), validation (25%), and testing (25%) -trainingandvalidation_features_df, - trainingandvalidation_labels_df, - testing_features_df, - testing_labels_df = PredictMD.split_data( - features_df, - labels_df, - 0.75, # 75% 
training+validation, 25% testing - ) -training_features_df, - training_labels_df, - validation_features_df, - validation_labels_df = PredictMD.split_data( - trainingandvalidation_features_df, - trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation - ) - -############################################################################## -############################################################################## -### Section 3: Apply the SMOTE algorithm to the training set ################# -############################################################################## -############################################################################## - -# Examine prevalence of each class in training set -# DataFrames.describe(training_labels_df[labelname]) -StatsBase.countmap(training_labels_df[labelname]) - -# We see that malignant is minority class and benign is majority class. -# The ratio of malignant:benign is somewhere between 1:2.5 and 1:3 (depending -# on random seed). We would like that ratio to be 1:1. We will use SMOTE -# to generate synthetic minority class samples. We will also undersample the -# minority class. The result will be a balanced training set. -majorityclass = "benign" -minorityclass = "malignant" - -smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( - training_features_df, - training_labels_df, - featurenames, - labelname; - majorityclass = majorityclass, - minorityclass = minorityclass, - pct_over = 100, # how much to oversample the minority class - minority_to_majority_ratio = 1.0, # desired minority:majority ratio - k = 5, - ) - -# Examine prevalence of each class in smoted training set -# DataFrames.describe(smoted_training_labels_df[labelname]) -StatsBase.countmap(smoted_training_labels_df[labelname]) - -# Now we have a ratio of malignant:benign that is 1:1. - -############################################################################## -############################################################################## -### Section 4: Set up and train models ####################################### -############################################################################## -############################################################################## - -############################################################################## -## Logistic "regression" classifier ########################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - logisticclassifier = PredictMD.load_model(logisticclassifier_filename) -else - # Set up logistic classifier model - logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( - featurenames, - labelname, - labellevels; - package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 1, # optional, defaults to 1 - name = "Logistic regression", # optional - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train logistic classifier model on smoted training set - PredictMD.fit!( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# View coefficients, p values, etc. 
for underlying logistic regression -PredictMD.get_underlying(logisticclassifier) - -# Plot classifier histogram for logistic classifier on smoted training set -logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_training) - -# Plot classifier histogram for logistic classifier on testing set -logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(logistic_hist_testing) - -# Evaluate performance of logistic classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of logistic classifier on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( - logisticclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - window = 0.2, - ) -PredictMD.open_plot(logistic_calibration_curve) - -PredictMD.probability_calibration_metrics( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) - -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - average_function = mean, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) -logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - average_function = median, - ) -println( - string( - "Low risk: 0 to $(logistic_cutoffs[1]).", - " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).", - " High risk: $(logistic_cutoffs[2]) to 1.", - ) - ) -showall(logistic_risk_group_prevalences) - -############################################################################## -## Random forest classifier ################################################## -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - rfclassifier = PredictMD.load_model(rfclassifier_filename) -else - # Set up random forest classifier model - rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( - featurenames, - labelname, - labellevels; - nsubfeatures = 4, # number of subfeatures; defaults to 2 - ntrees = 200, # number of trees; defaults to 10 - package = :DecisionTreejl, - name = "Random forest", # optional - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train random forest classifier model on smoted training set - PredictMD.fit!( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for random forest classifier on smoted training set 
-rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_training) - -# Plot classifier histogram for random forest classifier on testing set -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(rfclassifier_hist_testing) - -# Evaluate performance of random forest classifier on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of random forest on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -rf_calibration_curve = PredictMD.plot_probability_calibration_curve( - rfclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - window = 0.1, - ) -PredictMD.open_plot(rf_calibration_curve) - -############################################################################## -## Support vector machine (C support vector classifier) ###################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) -else - # Set up C-SVC model - csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.SVC, - name = "SVM (C-SVC)", - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train C-SVC model on smoted training set - PredictMD.fit!( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for C-SVC on smoted training set -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(csvc_svmclassifier_hist_training) - -# Plot classifier histogram for C-SVC on testing set -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(csvc_svmclassifier_hist_testing) - -# Evaluate performance of C-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of C-SVC on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Support vector machine (nu support vector classifier) ##################### -############################################################################## - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - nusvc_svmclassifier = 
PredictMD.load_model(nusvc_svmclassifier_filename) -else - # Set up nu-SVC model - nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( - featurenames, - labelname, - labellevels; - package = :LIBSVMjl, - svmtype = LIBSVM.NuSVC, - name = "SVM (nu-SVC)", - verbose = false, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train nu-SVC model on smoted training set - PredictMD.fit!( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - ) -end - -# Plot classifier histogram for nu-SVC on smoted training set -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(nusvc_svmclassifier_hist_training) - -# Plot classifier histogram for nu-SVC on testing set -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(nusvc_svmclassifier_hist_testing) - -# Evaluate performance of nu-SVC on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of SVM on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -## Multilayer perceptron (i.e. fully connected feedforward neural network) ### -############################################################################## - -# Define predict function -function knetmlp_predict( - w, # don't put a type annotation on this - x0::AbstractArray; - probabilities::Bool = true, - ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases - unnormalizedlogprobs = x3 - if probabilities - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - else - return unnormalizedlogprobs - end -end - -# Define loss function -function knetmlp_loss( - predict::Function, - modelweights, # don't put a type annotation on this - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict( - modelweights, - x; - probabilities = false, - ), - ytrue, - 1, # d = 1 means that instances are in columns - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" - knetmlpclassifier = PredictMD.load_model(knetmlp_filename) -else - # Randomly initialize model weights - knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # first hidden layer (64 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights - ), - Cfloat.( - zeros(Cfloat,64,1) # biases - ), 
- # - # second hidden layer (32 neurons): - Cfloat.( - 0.1f0*randn(Cfloat,32,64) # weights - ), - Cfloat.( - zeros(Cfloat,32,1) # biases - ), - # - # output layer (number of neurons == number of classes): - Cfloat.( - 0.1f0*randn(Cfloat,2,32) # weights - ), - Cfloat.( - zeros(Cfloat,2,1) # biases - ), - ] - # Define loss hyperparameters - knetmlp_losshyperparameters = Dict() - knetmlp_losshyperparameters[:L1] = Cfloat(0.0) - knetmlp_losshyperparameters[:L2] = Cfloat(0.0) - # Select optimization algorithm - knetmlp_optimizationalgorithm = :Momentum - # Set optimization hyperparameters - knetmlp_optimizerhyperparameters = Dict() - # Set the minibatch size - knetmlp_minibatchsize = 48 - # Set the max number of epochs. After training, look at the learning curve. If - # it looks like the model has not yet converged, raise maxepochs. If it looks - # like the loss has hit a plateau and you are worried about overfitting, lower - # maxepochs. - knetmlp_maxepochs = 1_000 - # Set up multilayer perceptron model - knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( - featurenames, - labelname, - labellevels; - package = :Knetjl, - name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, - losshyperparameters = knetmlp_losshyperparameters, - optimizationalgorithm = knetmlp_optimizationalgorithm, - optimizerhyperparameters = knetmlp_optimizerhyperparameters, - minibatchsize = knetmlp_minibatchsize, - modelweights = knetmlp_modelweights, - printlosseverynepochs = 100, # if 0, will not print at all - maxepochs = knetmlp_maxepochs, - feature_contrasts = feature_contrasts, - ) -end - -if get(ENV, "LOADTRAINEDMODELSFROMFILE", "") == "true" -else - # Train multilayer perceptron model on training set - PredictMD.fit!( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - validation_features_df, - validation_labels_df, - ) -end - -# Plot learning curve: loss vs. epoch -knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch) - -# Plot learning curve: loss vs. epoch, skip the first 10 epochs -knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_epoch; - startat = 10, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs) - -# Plot learning curve: loss vs. iteration -knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration) - -# Plot learning curve: loss vs. 
iteration, skip the first 100 iterations -knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve( - knetmlpclassifier, - :loss_vs_iteration; - window = 50, - sampleevery = 10, - startat = 100, - endat = :end, - ) -PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations) - -# Plot classifier histogram for multilayer perceptron on smoted training set -knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_training) - -# Plot classifier histogram for multilayer perceptron on testing set -knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - labellevels, - ) -PredictMD.open_plot(knetmlpclassifier_hist_testing) - -# Evaluate performance of multilayer perceptron on smoted training set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - smoted_training_features_df, - smoted_training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -# Evaluate performance of multilayer perceptron on testing set -PredictMD.singlelabelbinaryclassificationmetrics( - knetmlpclassifier, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - ) - -############################################################################## -############################################################################## -## Section 5: Compare performance of all models ############################## -############################################################################## -############################################################################## - -all_models = PredictMD.Fittable[ - logisticclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, - ] - -# Compare performance of all models on smoted training set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - training_features_df, - training_labels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Compare performance of all models on testing set -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - sensitivity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - specificity = 0.95, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :f1score, - )) -showall(PredictMD.singlelabelbinaryclassificationmetrics( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass; - maximize = :cohen_kappa, - )) - -# Plot receiver 
operating characteristic curves for all models on testing set. -rocplottesting = PredictMD.plotroccurves( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open_plot(rocplottesting) - -# Plot precision-recall curves for all models on testing set. -prplottesting = PredictMD.plotprcurves( - all_models, - testing_features_df, - testing_labels_df, - labelname, - positiveclass, - ) -PredictMD.open_plot(prplottesting) - -############################################################################## -############################################################################## -### Section 6: Save trained models to file (if desired) ####################### -############################################################################## -############################################################################## - -if get(ENV, "SAVETRAINEDMODELSTOFILE", "") == "true" - PredictMD.save_model(logisticclassifier_filename, logisticclassifier) - PredictMD.save_model(rfclassifier_filename, rfclassifier) - PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier) - PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier) - PredictMD.save_model(knetmlp_filename, knetmlpclassifier) -end - -############################################################################## -############################################################################## -## Appendix A: Directly access the output of classification models ########### -############################################################################## -############################################################################## - -# We can use the PredictMD.predict_proba() function to get the probabilities output -# by each of the classification models. - -# Get probabilities from each model for smoted training set -PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,) -PredictMD.predict_proba(rfclassifier,smoted_training_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,) - -# Get probabilities from each model for testing set -PredictMD.predict_proba(logisticclassifier,testing_features_df,) -PredictMD.predict_proba(rfclassifier,testing_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) - -# If we want to get predicted classes instead of probabilities, we can use the -# PredictMD.predict() function to get the class predictions output by each of the -# classification models. For each sample, PredictMD.predict() will select the class -# with the highest probability. In the case of binary classification, this is -# equivalent to using a threshold of 0.5. 
-
-# Get class predictions from each model for smoted training set
-PredictMD.predict(logisticclassifier,smoted_training_features_df,)
-PredictMD.predict(rfclassifier,smoted_training_features_df,)
-PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,)
-PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,)
-PredictMD.predict(knetmlpclassifier,smoted_training_features_df,)
-
-# Get class predictions from each model for testing set
-PredictMD.predict(logisticclassifier,testing_features_df,)
-PredictMD.predict(rfclassifier,testing_features_df,)
-PredictMD.predict(csvc_svmclassifier,testing_features_df,)
-PredictMD.predict(nusvc_svmclassifier,testing_features_df,)
-PredictMD.predict(knetmlpclassifier,testing_features_df,)
-
-Base.Test.@test(isfile(ENV["logisticclassifier_filename"]))
-Base.Test.@test(isfile(ENV["rfclassifier_filename"]))
-Base.Test.@test(isfile(ENV["csvc_svmclassifier_filename"]))
-Base.Test.@test(isfile(ENV["nusvc_svmclassifier_filename"]))
-Base.Test.@test(isfile(ENV["knetmlp_filename"]))
diff --git a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl
new file mode 100644
index 000000000..fc6eb6b27
--- /dev/null
+++ b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl
@@ -0,0 +1,129 @@
+srand(999)
+
+import CSV
+import DataFrames
+import LIBSVM
+import PredictMD
+
+# File paths written earlier by preprocess_data.jl
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+# Load the SMOTE-balanced training set written by an earlier step
+smoted_training_features_df_filename =
+    ENV["smoted_training_features_df_filename"]
+smoted_training_labels_df_filename =
+    ENV["smoted_training_labels_df_filename"]
+smoted_training_features_df = CSV.read(
+    smoted_training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+smoted_training_labels_df = CSV.read(
+    smoted_training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["c_svc_svm_classifier_filename"] = string(
+    tempname(),
+    "c_svc_svm_classifier.jld2",
+    )
+c_svc_svm_classifier_filename = ENV["c_svc_svm_classifier_filename"]
+
+csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier(
+    featurenames,
+    labelname,
+    labellevels;
+    package = :LIBSVMjl,
+    svmtype = LIBSVM.SVC,
+    name = "SVM (C-SVC)",
+    verbose = false,
+    feature_contrasts = feature_contrasts,
+    )
+
+PredictMD.fit!(
+    csvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    )
+
+csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    csvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(csvc_svmclassifier_hist_training)
+
+csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    csvc_svmclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(csvc_svmclassifier_hist_testing)
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    csvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    csvc_svmclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.save_model(c_svc_svm_classifier_filename, csvc_svmclassifier)
diff --git a/examples/breast_cancer_biopsy/compare_models.jl b/examples/breast_cancer_biopsy/compare_models.jl
new file mode 100644
index 000000000..e18144966
--- /dev/null
+++ b/examples/breast_cancer_biopsy/compare_models.jl
@@ -0,0 +1,202 @@
+srand(999)
+
+import CSV
+import DataFrames
+import Knet
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+# Load the trained models saved by the per-model scripts
+logisticclassifier = PredictMD.load_model(ENV["logistic_classifier_filename"])
+rfclassifier = PredictMD.load_model(ENV["random_forest_classifier_filename"])
+csvc_svmclassifier = PredictMD.load_model(ENV["c_svc_svm_classifier_filename"])
+nusvc_svmclassifier = PredictMD.load_model(ENV["nu_svc_svm_classifier_filename"])
+
+function knetmlp_predict(
+        w, # don't put a type annotation on this
+        x0::AbstractArray;
+        probabilities::Bool = true,
+        )
+    # x0 = input layer
+    # x1 = first hidden layer
+    x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases
+    # x2 = second hidden layer
+    x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases
+    # x3 = output layer
+    x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases
+    unnormalizedlogprobs = x3
+    if probabilities
+        normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1)
+        normalizedprobs = exp.(normalizedlogprobs)
+        return normalizedprobs
+    else
+        return unnormalizedlogprobs
+    end
+end
+
+function knetmlp_loss(
+        predict::Function,
+        modelweights, # don't put a type annotation on this
+        x::AbstractArray,
+        ytrue::AbstractArray;
+        L1::Real = Cfloat(0),
+        L2::Real = Cfloat(0),
+        )
+    loss = Knet.nll(
+        predict(
+            modelweights,
+            x;
+            probabilities = false,
+            ),
+        ytrue,
+        1, # d = 1 means that instances are in columns
+        )
+    if L1 != 0
+        loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])
+    end
+    if L2 != 0
+        loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])
+    end
+    return loss
+end
+
+knetmlpclassifier = PredictMD.load_model(ENV["knet_mlp_classifier_filename"])
+
+all_models = PredictMD.Fittable[
+    logisticclassifier,
+    rfclassifier,
+    csvc_svmclassifier,
+    nusvc_svmclassifier,
+    knetmlpclassifier,
+    ]
+
+# Compare performance of all models on the training set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    training_features_df,
+    training_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
+
+# Compare performance of all models on the testing set
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    specificity = 0.95,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :f1score,
+    ))
+showall(PredictMD.singlelabelbinaryclassificationmetrics(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    maximize = :cohen_kappa,
+    ))
+
+rocplottesting = PredictMD.plotroccurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(rocplottesting)
+
+prplottesting = PredictMD.plotprcurves(
+    all_models,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass,
+    )
+PredictMD.open_plot(prplottesting)
diff --git a/examples/breast_cancer_biopsy/get_model_output.jl b/examples/breast_cancer_biopsy/get_model_output.jl
new file mode 100644
index 000000000..4e32699cb
--- /dev/null
+++ b/examples/breast_cancer_biopsy/get_model_output.jl
@@ -0,0 +1,134 @@
+srand(999)
+
+import CSV
+import DataFrames
+import Knet
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+# Load the SMOTE-balanced training set; the filenames are assumed to have been
+# set by an earlier script in this example, as in the other classifier scripts
+smoted_training_features_df_filename =
+    ENV["smoted_training_features_df_filename"]
+smoted_training_labels_df_filename =
+    ENV["smoted_training_labels_df_filename"]
+smoted_training_features_df = CSV.read(
+    smoted_training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+smoted_training_labels_df = CSV.read(
+    smoted_training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+logisticclassifier = PredictMD.load_model(ENV["logistic_classifier_filename"])
+rfclassifier = PredictMD.load_model(ENV["random_forest_classifier_filename"])
+csvc_svmclassifier = PredictMD.load_model(ENV["c_svc_svm_classifier_filename"])
+nusvc_svmclassifier = PredictMD.load_model(ENV["nu_svc_svm_classifier_filename"])
+
+function knetmlp_predict(
+        w, # don't put a type annotation on this
+        x0::AbstractArray;
+        probabilities::Bool = true,
+        )
+    # x0 = input layer
+    # x1 = first hidden layer
+    x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases
+    # x2 = second hidden layer
+    x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases
+    # x3 = output layer
+    x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases
+    unnormalizedlogprobs = x3
+    if probabilities
+        normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1)
+        normalizedprobs = exp.(normalizedlogprobs)
+        return normalizedprobs
+    else
+        return unnormalizedlogprobs
+    end
+end
+
+function knetmlp_loss(
+        predict::Function,
+        modelweights, # don't put a type annotation on this
+        x::AbstractArray,
+        ytrue::AbstractArray;
+        L1::Real = Cfloat(0),
+        L2::Real = Cfloat(0),
+        )
+    loss = Knet.nll(
+        predict(
+            modelweights,
+            x;
+            probabilities = false,
+            ),
+        ytrue,
+        1, # d = 1 means that instances are in columns
+        )
+    if L1 != 0
+        loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])
+    end
+    if L2 != 0
+        loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])
+    end
+    return loss
+end
+
+knetmlpclassifier = PredictMD.load_model(ENV["knet_mlp_classifier_filename"])
+
+# Get probabilities from each model for the smoted training set
+PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(rfclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,)
+
+# Get probabilities from each model for the testing set
+PredictMD.predict_proba(logisticclassifier,testing_features_df,)
+PredictMD.predict_proba(rfclassifier,testing_features_df,)
+PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,)
+PredictMD.predict_proba(knetmlpclassifier,testing_features_df,)
+
+# Get class predictions from each model for the smoted training set
+PredictMD.predict(logisticclassifier,smoted_training_features_df,)
+PredictMD.predict(rfclassifier,smoted_training_features_df,)
+PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,)
+PredictMD.predict(knetmlpclassifier,smoted_training_features_df,)
+
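+# Get class predictions from each model for the testing set. For each sample,
+# PredictMD.predict() selects the class with the highest predicted probability;
+# for a binary classifier this is equivalent to using a 0.5 threshold.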
+PredictMD.predict(logisticclassifier,testing_features_df,)
+PredictMD.predict(rfclassifier,testing_features_df,)
+PredictMD.predict(csvc_svmclassifier,testing_features_df,)
+PredictMD.predict(nusvc_svmclassifier,testing_features_df,)
+PredictMD.predict(knetmlpclassifier,testing_features_df,)
diff --git a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl
new file mode 100644
index 000000000..bae4027e5
--- /dev/null
+++ b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl
@@ -0,0 +1,242 @@
+srand(999)
+
+import CSV
+import DataFrames
+import Knet
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+# Load the SMOTE-balanced training set; the filenames are assumed to have been
+# set by an earlier script in this example, as in the other classifier scripts
+smoted_training_features_df_filename =
+    ENV["smoted_training_features_df_filename"]
+smoted_training_labels_df_filename =
+    ENV["smoted_training_labels_df_filename"]
+smoted_training_features_df = CSV.read(
+    smoted_training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+smoted_training_labels_df = CSV.read(
+    smoted_training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["knet_mlp_classifier_filename"] = string(
+    tempname(),
+    "knet_mlp_classifier.jld2",
+    )
+knet_mlp_classifier_filename = ENV["knet_mlp_classifier_filename"]
+
+function knetmlp_predict(
+        w, # don't put a type annotation on this
+        x0::AbstractArray;
+        probabilities::Bool = true,
+        )
+    # x0 = input layer
+    # x1 = first hidden layer
+    x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases
+    # x2 = second hidden layer
+    x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases
+    # x3 = output layer
+    x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases
+    unnormalizedlogprobs = x3
+    if probabilities
+        normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1)
+        normalizedprobs = exp.(normalizedlogprobs)
+        return normalizedprobs
+    else
+        return unnormalizedlogprobs
+    end
+end
+
+function knetmlp_loss(
+        predict::Function,
+        modelweights, # don't put a type annotation on this
+        x::AbstractArray,
+        ytrue::AbstractArray;
+        L1::Real = Cfloat(0),
+        L2::Real = Cfloat(0),
+        )
+    loss = Knet.nll(
+        predict(
+            modelweights,
+            x;
+            probabilities = false,
+            ),
+        ytrue,
+        1, # d = 1 means that instances are in columns
+        )
+    if L1 != 0
+        loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end])
+    end
+    if L2 != 0
+        loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end])
+    end
+    return loss
+end
+
+knetmlp_modelweights = Any[
+    # input layer has dimension feature_contrasts.num_array_columns
+    #
+    # first hidden layer (64 neurons):
+    Cfloat.(
+        0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights
+        ),
+    Cfloat.(
+        zeros(Cfloat,64,1) # biases
+        ),
+    #
+    # second hidden layer (32 neurons):
+    Cfloat.(
+        0.1f0*randn(Cfloat,32,64) # weights
+        ),
+    Cfloat.(
+        zeros(Cfloat,32,1) # biases
+        ),
+    #
+    # output layer (number of neurons == number of classes):
+    Cfloat.(
+        0.1f0*randn(Cfloat,2,32) # weights
+        ),
+    Cfloat.(
+        zeros(Cfloat,2,1) # biases
+        ),
+    ]
+
+knetmlp_losshyperparameters = Dict()
+knetmlp_losshyperparameters[:L1] = Cfloat(0.0)
+knetmlp_losshyperparameters[:L2] = Cfloat(0.0)
+
+knetmlp_optimizationalgorithm = :Momentum
+knetmlp_optimizerhyperparameters = Dict()
+knetmlp_minibatchsize = 48
+knetmlp_maxepochs = 1_000
+
+knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier(
+    featurenames,
+    labelname,
+    labellevels;
+    package = :Knetjl,
+    name = "Knet MLP",
+    predict = knetmlp_predict,
+    loss = knetmlp_loss,
+    losshyperparameters = knetmlp_losshyperparameters,
+    optimizationalgorithm = knetmlp_optimizationalgorithm,
+    optimizerhyperparameters = knetmlp_optimizerhyperparameters,
+    minibatchsize = knetmlp_minibatchsize,
+    modelweights = knetmlp_modelweights,
+    printlosseverynepochs = 100, # if 0, will not print at all
+    maxepochs = knetmlp_maxepochs,
+    feature_contrasts = feature_contrasts,
+    )
+
+PredictMD.fit!(
+    knetmlpclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    validation_features_df,
+    validation_labels_df,
+    )
+
+knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve(
+    knetmlpclassifier,
+    :loss_vs_epoch;
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsepoch)
+
+knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
+    knetmlpclassifier,
+    :loss_vs_epoch;
+    startat = 10,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs)
+
+knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
+    knetmlpclassifier,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration)
+
+knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
+    knetmlpclassifier,
+    :loss_vs_iteration;
+    window = 50,
+    sampleevery = 10,
+    startat = 100,
+    endat = :end,
+    )
+PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
+
+knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knetmlpclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(knetmlpclassifier_hist_training)
+
+knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knetmlpclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(knetmlpclassifier_hist_testing)
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    knetmlpclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    knetmlpclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.save_model(knet_mlp_classifier_filename, knetmlpclassifier)
diff --git a/examples/breast_cancer_biopsy/logistic_classifier.jl b/examples/breast_cancer_biopsy/logistic_classifier.jl
new file mode 100644
index 000000000..dac48f474
--- /dev/null
+++ b/examples/breast_cancer_biopsy/logistic_classifier.jl
@@ -0,0 +1,181 @@
+srand(999)
+
+import CSV
+import DataFrames
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+smoted_training_features_df_filename =
+    ENV["smoted_training_features_df_filename"]
+smoted_training_labels_df_filename =
+    ENV["smoted_training_labels_df_filename"]
+smoted_training_features_df = CSV.read(
+    smoted_training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+smoted_training_labels_df = CSV.read(
+    smoted_training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["logistic_classifier_filename"] = string(
+    tempname(),
+    "logistic_classifier.jld2",
+    )
+logistic_classifier_filename = ENV["logistic_classifier_filename"]
+
+logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier(
+    featurenames,
+    labelname,
+    labellevels;
+    package = :GLMjl,
+    intercept = true, # optional, defaults to true
+    interactions = 1, # optional, defaults to 1
+    name = "Logistic regression", # optional
+    )
+
+PredictMD.fit!(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    )
+
+# View the coefficients, p values, etc. of the underlying logistic regression
+PredictMD.get_underlying(logisticclassifier)
+
+logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_training)
+
+logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(logistic_hist_testing)
+
+# Evaluate performance on the smoted training set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+# Evaluate performance on the testing set
+PredictMD.singlelabelbinaryclassificationmetrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+logistic_calibration_curve = PredictMD.plot_probability_calibration_curve(
+    logisticclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.2,
+    )
+PredictMD.open_plot(logistic_calibration_curve)
+
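+# Report probability calibration metrics on the testing set. The window
+# argument is assumed here to be the width of the predicted-probability bins
+# used when comparing predicted probabilities to observed prevalences.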
+PredictMD.probability_calibration_metrics(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    window = 0.1,
+    )
+
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    average_function = mean,
+    )
+println(
+    string(
+        "Low risk: 0 to $(logistic_cutoffs[1]).",
+        " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).",
+        " High risk: $(logistic_cutoffs[2]) to 1.",
+        )
+    )
+showall(logistic_risk_group_prevalences)
+logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values(
+    logisticclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    average_function = median,
+    )
+println(
+    string(
+        "Low risk: 0 to $(logistic_cutoffs[1]).",
+        " Medium risk: $(logistic_cutoffs[1]) to $(logistic_cutoffs[2]).",
+        " High risk: $(logistic_cutoffs[2]) to 1.",
+        )
+    )
+showall(logistic_risk_group_prevalences)
+
+PredictMD.save_model(logistic_classifier_filename, logisticclassifier)
diff --git a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl
new file mode 100644
index 000000000..e40c17270
--- /dev/null
+++ b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl
@@ -0,0 +1,129 @@
+srand(999)
+
+import CSV
+import DataFrames
+import LIBSVM
+import PredictMD
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+smoted_training_features_df_filename =
+    ENV["smoted_training_features_df_filename"]
+smoted_training_labels_df_filename =
+    ENV["smoted_training_labels_df_filename"]
+smoted_training_features_df = CSV.read(
+    smoted_training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+smoted_training_labels_df = CSV.read(
+    smoted_training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+ENV["nu_svc_svm_classifier_filename"] = string(
+    tempname(),
+    "nu_svc_svm_classifier.jld2",
+    )
+nu_svc_svm_classifier_filename = ENV["nu_svc_svm_classifier_filename"]
+
+nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier(
+    featurenames,
labelname,
+    labellevels;
+    package = :LIBSVMjl,
+    svmtype = LIBSVM.NuSVC,
+    name = "SVM (nu-SVC)",
+    verbose = false,
+    feature_contrasts = feature_contrasts,
+    )
+
+PredictMD.fit!(
+    nusvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    )
+
+nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    nusvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(nusvc_svmclassifier_hist_training)
+
+nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    nusvc_svmclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(nusvc_svmclassifier_hist_testing)
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    nusvc_svmclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    nusvc_svmclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier)
diff --git a/examples/breast_cancer_biopsy/preprocess_data.jl b/examples/breast_cancer_biopsy/preprocess_data.jl
new file mode 100644
index 000000000..c71a08d75
--- /dev/null
+++ b/examples/breast_cancer_biopsy/preprocess_data.jl
@@ -0,0 +1,135 @@
+srand(999)
+
+import CSV
+import DataFrames
+import PredictMD
+import RDatasets
+import StatsBase
+
+df = RDatasets.dataset("MASS", "biopsy")
+
+DataFrames.dropmissing!(df)
+
+PredictMD.shuffle_rows!(df)
+
+categoricalfeaturenames = Symbol[]
+continuousfeaturenames = Symbol[
+    :V1,
+    :V2,
+    :V3,
+    :V4,
+    :V5,
+    :V6,
+    :V7,
+    :V8,
+    :V9,
+    ]
+featurenames = vcat(categoricalfeaturenames, continuousfeaturenames)
+
+singlelabelname = :Class
+negativeclass = "benign"
+positiveclass = "malignant"
+singlelabellevels = [negativeclass, positiveclass]
+
+labelnames = [singlelabelname
+
+features_df = df[featurenames]
+labels_df = df[labelnames]
+
+trainingandvalidation_features_df,
+    trainingandvalidation_labels_df,
+    testing_features_df,
+    testing_labels_df = PredictMD.split_data(
+        features_df,
+        labels_df,
+        0.75, # 75% training+validation, 25% testing
+        )
+training_features_df,
+    training_labels_df,
+    validation_features_df,
+    validation_labels_df = PredictMD.split_data(
+        trainingandvalidation_features_df,
+        trainingandvalidation_labels_df,
+        2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation
+        )
+
+ENV["trainingandvalidation_features_df_filename"] = string(
+    tempname(),
+    "_trainingandvalidation_features_df.csv",
+    )
+ENV["trainingandvalidation_labels_df_filename"] = string(
+    tempname(),
+    "_trainingandvalidation_labels_df.csv",
+    )
+ENV["testing_features_df_filename"] = string(
+    tempname(),
+    "_testing_features_df.csv",
+    )
+ENV["testing_labels_df_filename"] = string(
+    tempname(),
+    "_testing_labels_df.csv",
+    )
+ENV["training_features_df_filename"] = string(
+    tempname(),
+    "_training_features_df.csv",
+    )
+ENV["training_labels_df_filename"] = string(
+    tempname(),
+    "_training_labels_df.csv",
+    )
+ENV["validation_features_df_filename"] = string(
+    tempname(),
+    "_validation_features_df.csv",
+    )
+ENV["validation_labels_df_filename"] = string(
+    tempname(),
+    "_validation_labels_df.csv",
+    )
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+
ENV["trainingandvalidation_labels_df_filename"] +testing_features_df_filename = + ENV["testing_features_df_filename"] +testing_labels_df_filename = + ENV["testing_labels_df_filename"] +training_features_df_filename = + ENV["training_features_df_filename"] +training_labels_df_filename = + ENV["training_labels_df_filename"] +validation_features_df_filename = + ENV["validation_features_df_filename"] +validation_labels_df_filename = + ENV["validation_labels_df_filename"] +CSV.write( + trainingandvalidation_features_df_filename, + trainingandvalidation_features_df, + ) +CSV.write( + trainingandvalidation_labels_df_filename, + trainingandvalidation_labels_df, + ) +CSV.write( + testing_features_df_filename, + testing_features_df, + ) +CSV.write( + testing_labels_df_filename, + testing_labels_df, + ) +CSV.write( + training_features_df_filename, + training_features_df, + ) +CSV.write( + training_labels_df_filename, + training_labels_df, + ) +CSV.write( + validation_features_df_filename, + validation_features_df, + ) +CSV.write( + validation_labels_df_filename, + validation_labels_df, + ) diff --git a/examples/breast_cancer_biopsy/random_forest_classifier.jl b/examples/breast_cancer_biopsy/random_forest_classifier.jl new file mode 100644 index 000000000..bbd7739eb --- /dev/null +++ b/examples/breast_cancer_biopsy/random_forest_classifier.jl @@ -0,0 +1,129 @@ +srand(999) + + +import CSV +import DataFrames +import PredictMD + +trainingandvalidation_features_df_filename = + ENV["trainingandvalidation_features_df_filename"] +trainingandvalidation_labels_df_filename = + ENV["trainingandvalidation_labels_df_filename"] +testing_features_df_filename = + ENV["testing_features_df_filename"] +testing_labels_df_filename = + ENV["testing_labels_df_filename"] +training_features_df_filename = + ENV["training_features_df_filename"] +training_labels_df_filename = + ENV["training_labels_df_filename"] +validation_features_df_filename = + ENV["validation_features_df_filename"] +validation_labels_df_filename = + ENV["validation_labels_df_filename"] +trainingandvalidation_features_df = CSV.read( + trainingandvalidation_features_df_filename, + DataFrames.DataFrame, + ) +trainingandvalidation_labels_df = CSV.read( + trainingandvalidation_labels_df_filename, + DataFrames.DataFrame, + ) +testing_features_df = CSV.read( + testing_features_df_filename, + DataFrames.DataFrame, + ) +testing_labels_df = CSV.read( + testing_labels_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +training_features_df = CSV.read( + training_features_df_filename, + DataFrames.DataFrame, + ) +validation_features_df = CSV.read( + validation_features_df_filename, + DataFrames.DataFrame, + ) +validation_labels_df = CSV.read( + validation_labels_df_filename, + DataFrames.DataFrame, + ) + +smoted_training_features_df_filename = + ENV["smoted_training_features_df_filename"] +smoted_training_labels_df_filename = + ENV["smoted_training_labels_df_filename"] +smoted_training_features_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) +smoted_training_labels_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) + +ENV["random_forest_classifier_filename"] = string( + tempname(), + "random_forest_classifier.jld2", + ) +random_forest_classifier_filename = ENV["random_forest_classifier_filename"] + +rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( + featurenames, + labelname, + labellevels; 
+    nsubfeatures = 4, # number of subfeatures; defaults to 2
+    ntrees = 200, # number of trees; defaults to 10
+    package = :DecisionTreejl,
+    name = "Random forest", # optional
+    feature_contrasts = feature_contrasts,
+    )
+
+PredictMD.fit!(
+    rfclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    )
+
+rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    rfclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(rfclassifier_hist_training)
+
+rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    rfclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    labellevels,
+    )
+PredictMD.open_plot(rfclassifier_hist_testing)
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    rfclassifier,
+    smoted_training_features_df,
+    smoted_training_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.singlelabelbinaryclassificationmetrics(
+    rfclassifier,
+    testing_features_df,
+    testing_labels_df,
+    labelname,
+    positiveclass;
+    sensitivity = 0.95,
+    )
+
+PredictMD.save_model(rfclassifier_filename, rfclassifier)
diff --git a/examples/breast_cancer_biopsy/smote.jl b/examples/breast_cancer_biopsy/smote.jl
new file mode 100644
index 000000000..0a9841290
--- /dev/null
+++ b/examples/breast_cancer_biopsy/smote.jl
@@ -0,0 +1,98 @@
+srand(999)
+
+import CSV
+import DataFrames
+import PredictMD
+import StatsBase
+
+trainingandvalidation_features_df_filename =
+    ENV["trainingandvalidation_features_df_filename"]
+trainingandvalidation_labels_df_filename =
+    ENV["trainingandvalidation_labels_df_filename"]
+testing_features_df_filename =
+    ENV["testing_features_df_filename"]
+testing_labels_df_filename =
+    ENV["testing_labels_df_filename"]
+training_features_df_filename =
+    ENV["training_features_df_filename"]
+training_labels_df_filename =
+    ENV["training_labels_df_filename"]
+validation_features_df_filename =
+    ENV["validation_features_df_filename"]
+validation_labels_df_filename =
+    ENV["validation_labels_df_filename"]
+trainingandvalidation_features_df = CSV.read(
+    trainingandvalidation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+trainingandvalidation_labels_df = CSV.read(
+    trainingandvalidation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_features_df = CSV.read(
+    testing_features_df_filename,
+    DataFrames.DataFrame,
+    )
+testing_labels_df = CSV.read(
+    testing_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+training_features_df = CSV.read(
+    training_features_df_filename,
+    DataFrames.DataFrame,
+    )
+training_labels_df = CSV.read(
+    training_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_features_df = CSV.read(
+    validation_features_df_filename,
+    DataFrames.DataFrame,
+    )
+validation_labels_df = CSV.read(
+    validation_labels_df_filename,
+    DataFrames.DataFrame,
+    )
+
+
+DataFrames.describe(training_labels_df[labelname])
+StatsBase.countmap(training_labels_df[labelname])
+
+majorityclass = "benign"
+minorityclass = "malignant"
+
+smoted_training_features_df, smoted_training_labels_df = PredictMD.smote(
+    training_features_df,
+    training_labels_df,
+    featurenames,
+    labelname;
+    majorityclass = majorityclass,
+    minorityclass = minorityclass,
+    pct_over = 100, # how much to oversample the minority class
+    minority_to_majority_ratio = 1.0, # desired minority:majority ratio
+    k = 5,
+    )
+
+DataFrames.describe(smoted_training_labels_df[labelname])
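# ---------------------------------------------------------------------------
# NOTE (editorial sketch, not part of the original example): conceptually,
# SMOTE balances the classes by synthesizing new minority-class rows rather
# than merely duplicating existing ones. Each synthetic row lies at a random
# point on the segment between a minority sample and one of its k nearest
# minority neighbors. The function below is a hypothetical illustration of
# that core interpolation step only; PredictMD.smote itself handles neighbor
# search, pct_over, and the minority:majority ratio.
function synthesize_minority_row(
        x::Vector{Float64},  # an existing minority-class row
        xn::Vector{Float64}, # one of its k nearest minority neighbors
        )
    gap = rand() # uniform draw in [0, 1)
    return x .+ gap .* (xn .- x) # random point on the segment from x to xn
end
# ---------------------------------------------------------------------------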
+StatsBase.countmap(smoted_training_labels_df[labelname]) + +ENV["smoted_training_features_df_filename"] = string( + tempname(), + "_smoted_training_features_df.csv", + ) +ENV["smoted_training_labels_df_filename"] = string( + tempname(), + "_smoted_training_labels_df.csv", + ) +smoted_training_features_df_filename = + ENV["smoted_training_features_df_filename"] +smoted_training_labels_df_filename = + ENV["smoted_training_labels_df_filename"] +CSV.write( + smoted_training_features_df_filename, + smoted_training_features_df, + ) +CSV.write( + smoted_training_labels_df_filename, + smoted_training_labels_df, + ) From 0c2a073102d216518957b427bf99e7fb9e3ea0b6 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 00:06:37 -0400 Subject: [PATCH 38/62] Remove empty file --- examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl diff --git a/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl b/examples/breast_cancer_biopsy/OLD_breast_cancer_biopsy.jl deleted file mode 100644 index e69de29bb..000000000 From 7d98055b9bbb8f3cfeab426cff17605c5b67981b Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 00:26:09 -0400 Subject: [PATCH 39/62] Fix bugs --- examples/boston_housing/compare_models.jl | 18 ++++------ examples/boston_housing/get_model_output.jl | 6 ++-- .../boston_housing/knet_mlp_regression.jl | 36 ++++++++++++++----- examples/boston_housing/linear_regression.jl | 33 +++++++++++++---- .../random_forest_regression.jl | 33 +++++++++++++---- 5 files changed, 91 insertions(+), 35 deletions(-) diff --git a/examples/boston_housing/compare_models.jl b/examples/boston_housing/compare_models.jl index 0a1ec1519..72bd38168 100644 --- a/examples/boston_housing/compare_models.jl +++ b/examples/boston_housing/compare_models.jl @@ -59,7 +59,7 @@ random_forest_regression_filename = ENV["random_forest_regression_filename"] knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] linear_regression = PredictMD.load_model(linear_regression_filename) -random_forest_reg = PredictMD.load_model(random_forest_reg_filename) +random_forest_regression = PredictMD.load_model(random_forest_regression_filename) function knetmlp_predict( w, # don't put a type annotation on this @@ -101,28 +101,22 @@ knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) all_models = PredictMD.Fittable[ linear_regression, - random_forest_reg, + random_forest_regression, knet_mlp_regression, ] +singlelabelname = :MedV + showall(PredictMD.singlelabelregressionmetrics( all_models, training_features_df, training_labels_df, - labelname, + singlelabelname, )) showall(PredictMD.singlelabelregressionmetrics( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, )) - -PredictMD.predict(linear_regression,training_features_df,) -PredictMD.predict(random_forest_reg,training_features_df,) -PredictMD.predict(knet_mlp_regression,training_features_df,) - -PredictMD.predict(linear_regression,testing_features_df,) -PredictMD.predict(random_forest_reg,testing_features_df,) -PredictMD.predict(knet_mlp_regression,testing_features_df,) diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/get_model_output.jl index f86c3aa9f..d276065ec 100644 --- a/examples/boston_housing/get_model_output.jl +++ b/examples/boston_housing/get_model_output.jl @@ -59,7 +59,7 @@ random_forest_regression_filename = 
ENV["random_forest_regression_filename"] knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] linear_regression = PredictMD.load_model(linear_regression_filename) -random_forest_reg = PredictMD.load_model(random_forest_reg_filename) +random_forest_regression = PredictMD.load_model(random_forest_regression_filename) function knetmlp_predict( w, # don't put a type annotation on this @@ -100,9 +100,9 @@ end knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) PredictMD.predict(linear_regression,training_features_df,) -PredictMD.predict(random_forest_reg,training_features_df,) +PredictMD.predict(random_forest_regression,training_features_df,) PredictMD.predict(knet_mlp_regression,training_features_df,) PredictMD.predict(linear_regression,testing_features_df,) -PredictMD.predict(random_forest_reg,testing_features_df,) +PredictMD.predict(random_forest_regression,testing_features_df,) PredictMD.predict(knet_mlp_regression,testing_features_df,) diff --git a/examples/boston_housing/knet_mlp_regression.jl b/examples/boston_housing/knet_mlp_regression.jl index 277cdf1eb..340de0dc0 100644 --- a/examples/boston_housing/knet_mlp_regression.jl +++ b/examples/boston_housing/knet_mlp_regression.jl @@ -54,11 +54,32 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :Crim, + :Zn, + :Indus, + :Chas, + :NOx, + :Rm, + :Age, + :Dis, + :Rad, + :Tax, + :PTRatio, + :Black, + :LStat, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :MedV +labelnames = [singlelabelname] + ENV["knet_mlp_regression_filename"] = string( tempname(), "_knet_mlp_regression.jld2", ) -knet_mlp_regression = ENV["knet_mlp_regression_filename"] +knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] function knetmlp_predict( w, # don't put a type annotation on this @@ -128,7 +149,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, f knet_mlp_regression = PredictMD.singlelabeldataframeknetregression( featurenames, - labelname; + singlelabelname; package = :Knetjl, name = "Knet MLP", predict = knetmlp_predict, @@ -151,7 +172,6 @@ PredictMD.fit!( validation_labels_df, ) - knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve( knet_mlp_regression, :loss_vs_epoch; @@ -188,7 +208,7 @@ knet_mlp_regression_plot_training = PredictMD.plotsinglelabelregressiontrueversu knet_mlp_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.open_plot(knet_mlp_regression_plot_training) @@ -196,7 +216,7 @@ knet_mlp_regression_plot_testing = PredictMD.plotsinglelabelregressiontrueversus knet_mlp_regression, testing_features_df, testing_labels_df, - labelname, + singlelabelname, ) PredictMD.open_plot(knet_mlp_regression_plot_testing) @@ -204,14 +224,14 @@ PredictMD.singlelabelregressionmetrics( knet_mlp_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.singlelabelregressionmetrics( knet_mlp_regression, testing_features_df, testing_labels_df, - labelname, + singlelabelname, ) -PredictMD.save_model(knet_mlp_regression, knet_mlp_regression) +PredictMD.save_model(knet_mlp_regression_filename, knet_mlp_regression) diff --git a/examples/boston_housing/linear_regression.jl b/examples/boston_housing/linear_regression.jl index a47ed1425..22fe04cbe 100644 --- a/examples/boston_housing/linear_regression.jl +++ b/examples/boston_housing/linear_regression.jl @@ -53,15 +53,36 @@ 
validation_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :Crim, + :Zn, + :Indus, + :Chas, + :NOx, + :Rm, + :Age, + :Dis, + :Rad, + :Tax, + :PTRatio, + :Black, + :LStat, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :MedV +labelnames = [singlelabelname] + ENV["linear_regression_filename"] = string( tempname(), "_linear_regression.jld2", ) linear_regression_filename = ENV["linear_regression_filename"] -linear_regression = PredictMD.singlelabeldataframelinear_regressionression( +linear_regression = PredictMD.singlelabeldataframelinearregression( featurenames, - labelname; + singlelabelname; package = :GLMjl, intercept = true, # optional, defaults to true interactions = 2, # optional, defaults to 1 @@ -76,7 +97,7 @@ linear_regression_plot_training = PredictMD.plotsinglelabelregressiontrueversusp linear_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.open_plot(linear_regression_plot_training) @@ -84,7 +105,7 @@ linear_regression_plot_testing = PredictMD.plotsinglelabelregressiontrueversuspr linear_regression, testing_features_df, testing_labels_df, - labelname + singlelabelname ) PredictMD.open_plot(linear_regression_plot_testing) @@ -92,14 +113,14 @@ PredictMD.singlelabelregressionmetrics( linear_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.singlelabelregressionmetrics( linear_regression, testing_features_df, testing_labels_df, - labelname, + singlelabelname, ) PredictMD.save_model(linear_regression_filename, linear_regression) diff --git a/examples/boston_housing/random_forest_regression.jl b/examples/boston_housing/random_forest_regression.jl index 41ad9a7e5..85465e745 100644 --- a/examples/boston_housing/random_forest_regression.jl +++ b/examples/boston_housing/random_forest_regression.jl @@ -53,6 +53,27 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :Crim, + :Zn, + :Indus, + :Chas, + :NOx, + :Rm, + :Age, + :Dis, + :Rad, + :Tax, + :PTRatio, + :Black, + :LStat, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :MedV +labelnames = [singlelabelname] + ENV["random_forest_regression_filename"] = string( tempname(), "_random_forest_regression.jld2", @@ -61,9 +82,9 @@ random_forest_regression_filename = ENV["random_forest_regression_filename"] feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) -random_forest_regression = PredictMD.singlelabeldataframerandom_forest_regressionression( +random_forest_regression = PredictMD.singlelabeldataframerandomforestregression( featurenames, - labelname; + singlelabelname; nsubfeatures = 2, # number of subfeatures; defaults to 2 ntrees = 20, # number of trees; defaults to 10 package = :DecisionTreejl, @@ -77,7 +98,7 @@ random_forest_regression_plot_training = PredictMD.plotsinglelabelregressiontrue random_forest_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.open_plot(random_forest_regression_plot_training) @@ -85,7 +106,7 @@ random_forest_regression_plot_testing = PredictMD.plotsinglelabelregressiontruev random_forest_regression, testing_features_df, testing_labels_df, - labelname, + singlelabelname, ) PredictMD.open_plot(random_forest_regression_plot_testing) @@ -93,14 +114,14 @@ PredictMD.singlelabelregressionmetrics( 
random_forest_regression, training_features_df, training_labels_df, - labelname, + singlelabelname, ) PredictMD.singlelabelregressionmetrics( random_forest_regression, testing_features_df, testing_labels_df, - labelname, + singlelabelname, ) PredictMD.save_model(random_forest_regression_filename, random_forest_regression) From bfa83a4f3bb13d1b77ec737f93ba20bf6d68c1ef Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 00:48:36 -0400 Subject: [PATCH 40/62] Progress commit --- REQUIRE | 1 - docs/make_docs.jl | 284 +++++++++++++++++- ...eprocess_data.jl => 01_preprocess_data.jl} | 0 ..._regression.jl => 02_linear_regression.jl} | 0 ...sion.jl => 03_random_forest_regression.jl} | 0 ...egression.jl => 04_knet_mlp_regression.jl} | 0 ...compare_models.jl => 05_compare_models.jl} | 0 ...model_output.jl => 06_get_model_output.jl} | 0 .../c_svc_svm_classifier.jl | 39 ++- .../breast_cancer_biopsy/compare_models.jl | 26 +- .../knet_mlp_classifier.jl | 53 +++- .../logistic_classifier.jl | 47 ++- .../nu_svc_svm_classifier.jl | 39 ++- .../breast_cancer_biopsy/preprocess_data.jl | 2 +- .../random_forest_classifier.jl | 40 ++- examples/breast_cancer_biopsy/smote.jl | 28 +- 16 files changed, 486 insertions(+), 73 deletions(-) rename examples/boston_housing/{preprocess_data.jl => 01_preprocess_data.jl} (100%) rename examples/boston_housing/{linear_regression.jl => 02_linear_regression.jl} (100%) rename examples/boston_housing/{random_forest_regression.jl => 03_random_forest_regression.jl} (100%) rename examples/boston_housing/{knet_mlp_regression.jl => 04_knet_mlp_regression.jl} (100%) rename examples/boston_housing/{compare_models.jl => 05_compare_models.jl} (100%) rename examples/boston_housing/{get_model_output.jl => 06_get_model_output.jl} (100%) diff --git a/REQUIRE b/REQUIRE index 33908aeeb..4a5aedbaf 100644 --- a/REQUIRE +++ b/REQUIRE @@ -27,7 +27,6 @@ NNlib 0.2 NumericalIntegration 0.0.3 PGFPlots 2.2 PGFPlotsX 0.2 -PackageCompiler 0.3 ProgressMeter 0.5 RDatasets 0.3 ROCAnalysis 0.2 diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 5f4ab948c..5f2f04ce8 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -7,43 +7,305 @@ import PredictMD info("DEBUG: using Literate.jl to generate examples") -examples_output_parent_directory = joinpath( +examples_input_parent_directory = joinpath( @__DIR__, - "src", + "..", "examples", ) -examples_input_parent_directory = joinpath( +examples_output_parent_directory = joinpath( @__DIR__, - "..", + "src", "examples", ) -boston_housing_output_directory = joinpath( - examples_output_parent_directory, - "boston_housing", - ) boston_housing_input_directory = joinpath( examples_input_parent_directory, "boston_housing", ) +boston_housing_output_directory = joinpath( + examples_output_parent_directory, + "boston_housing", + ) Literate.markdown( - joinpath(boston_housing_input_directory, "boston_housing.jl"), + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = 
true, + execute = false, + ) +Literate.script( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(boston_housing_input_directory, ""), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, "boston_housing.jl"), + joinpath(boston_housing_input_directory, ""), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, "boston_housing.jl"), + joinpath(boston_housing_input_directory, ""), boston_housing_output_directory; documenter = true, keep_comments = true, ) +# +Literate.markdown( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(boston_housing_input_directory, ""), + boston_housing_output_directory; + documenter = true, + keep_comments = true, + ) + +breast_cancer_biopsy_input_directory = joinpath( + examples_input_parent_directory, + "breast_cancer_biopsy", + ) +breast_cancer_biopsy_output_directory = joinpath( + examples_output_parent_directory, + "breast_cancer_biopsyg", + ) + +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + 
breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) +# +Literate.markdown( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + ) +Literate.notebook( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + execute = false, + ) +Literate.script( + joinpath(breast_cancer_biopsy_input_directory, ""), + breast_cancer_biopsy_output_directory; + documenter = true, + keep_comments = true, + ) + info("DEBUG: using Documenter.jl to generate Markdown docs") diff --git a/examples/boston_housing/preprocess_data.jl b/examples/boston_housing/01_preprocess_data.jl similarity index 100% rename from examples/boston_housing/preprocess_data.jl rename to examples/boston_housing/01_preprocess_data.jl diff --git a/examples/boston_housing/linear_regression.jl b/examples/boston_housing/02_linear_regression.jl similarity index 100% rename from examples/boston_housing/linear_regression.jl rename to examples/boston_housing/02_linear_regression.jl diff --git a/examples/boston_housing/random_forest_regression.jl b/examples/boston_housing/03_random_forest_regression.jl similarity 
index 100% rename from examples/boston_housing/random_forest_regression.jl rename to examples/boston_housing/03_random_forest_regression.jl diff --git a/examples/boston_housing/knet_mlp_regression.jl b/examples/boston_housing/04_knet_mlp_regression.jl similarity index 100% rename from examples/boston_housing/knet_mlp_regression.jl rename to examples/boston_housing/04_knet_mlp_regression.jl diff --git a/examples/boston_housing/compare_models.jl b/examples/boston_housing/05_compare_models.jl similarity index 100% rename from examples/boston_housing/compare_models.jl rename to examples/boston_housing/05_compare_models.jl diff --git a/examples/boston_housing/get_model_output.jl b/examples/boston_housing/06_get_model_output.jl similarity index 100% rename from examples/boston_housing/get_model_output.jl rename to examples/boston_housing/06_get_model_output.jl diff --git a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl index fc6eb6b27..b2a88ecf6 100644 --- a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl @@ -67,16 +67,39 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] ENV["c_svc_svm_classifier_filename"] = string( tempname(), "c_svc_svm_classifier.jld2", ) c_svc_svm_classifier_filename = ENV["c_svc_svm_classifier_filename"] +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( featurenames, - labelname, - labellevels; + singlelabelname, + singlelabellevels; package = :LIBSVMjl, svmtype = LIBSVM.SVC, name = "SVM (C-SVC)", @@ -94,8 +117,8 @@ csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhist csvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(csvc_svmclassifier_hist_training) @@ -103,8 +126,8 @@ csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhisto csvc_svmclassifier, testing_features_df, testing_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(csvc_svmclassifier_hist_testing) @@ -112,7 +135,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -121,7 +144,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( csvc_svmclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) diff --git a/examples/breast_cancer_biopsy/compare_models.jl b/examples/breast_cancer_biopsy/compare_models.jl index e18144966..f43cc8c32 100644 --- a/examples/breast_cancer_biopsy/compare_models.jl +++ b/examples/breast_cancer_biopsy/compare_models.jl @@ -59,7 +59,7 @@ rfclassifier = PredictMD.load_model(rfclassifier_filename) csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename) -unction 
knetmlp_predict( +function knetmlp_predict( w, # don't put a type annotation on this x0::AbstractArray; probabilities::Bool = true, @@ -117,11 +117,15 @@ all_models = PredictMD.Fittable[ knetmlpclassifier, ] +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" + showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, )) @@ -129,7 +133,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, - labelname, + singlelabelname, positiveclass; specificity = 0.95, )) @@ -137,7 +141,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, - labelname, + singlelabelname, positiveclass; maximize = :f1score, )) @@ -145,7 +149,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, training_features_df, training_labels_df, - labelname, + singlelabelname, positiveclass; maximize = :cohen_kappa, )) @@ -154,7 +158,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, )) @@ -162,7 +166,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; specificity = 0.95, )) @@ -170,7 +174,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; maximize = :f1score, )) @@ -178,7 +182,7 @@ showall(PredictMD.singlelabelbinaryclassificationmetrics( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; maximize = :cohen_kappa, )) @@ -187,7 +191,7 @@ rocplottesting = PredictMD.plotroccurves( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass, ) PredictMD.open_plot(rocplottesting) @@ -196,7 +200,7 @@ prplottesting = PredictMD.plotprcurves( all_models, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass, ) PredictMD.open_plot(prplottesting) diff --git a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl index bae4027e5..77b6c60f2 100644 --- a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl +++ b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl @@ -54,6 +54,38 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) +smoted_training_features_df_filename = + ENV["smoted_training_features_df_filename"] +smoted_training_labels_df_filename = + ENV["smoted_training_labels_df_filename"] +smoted_training_features_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) +smoted_training_labels_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) + +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] + ENV["knet_mlp_classifier_filename"] = string( tempname(), "knet_mlp_classifier.jld2", @@ -145,10 +177,15 @@ knetmlp_optimizerhyperparameters = Dict() knetmlp_minibatchsize = 48 
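# ---------------------------------------------------------------------------
# NOTE (editorial sketch, not part of the original example): the minibatch
# size above controls how many SMOTEd rows enter each gradient update, and
# knetmlp_maxepochs (next line) caps how many full passes are made over the
# training data. Below is a plain illustration of minibatching over
# column-major data (x is features-by-observations); the example actually
# relies on Knet's own minibatch utility, whose interface may differ.
minibatch_ranges(n::Int, batchsize::Int) =
    [i:min(i + batchsize - 1, n) for i in 1:batchsize:n]
# One epoch is one pass over all ranges, e.g.:
# for r in minibatch_ranges(size(x, 2), 48)
#     xb, yb = x[:, r], y[:, r] # train on this minibatch
# end
# ---------------------------------------------------------------------------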
knetmlp_maxepochs = 1_000 +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( featurenames, - labelname, - labellevels; + singlelabelname, + singlelabellevels; package = :Knetjl, name = "Knet MLP", predict = knetmlp_predict, @@ -207,8 +244,8 @@ knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhisto knetmlpclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(knetmlpclassifier_hist_training) @@ -216,8 +253,8 @@ knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistog knetmlpclassifier, testing_features_df, testing_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(knetmlpclassifier_hist_testing) @@ -225,7 +262,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -234,7 +271,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( knetmlpclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) diff --git a/examples/breast_cancer_biopsy/logistic_classifier.jl b/examples/breast_cancer_biopsy/logistic_classifier.jl index dac48f474..4b4c6657b 100644 --- a/examples/breast_cancer_biopsy/logistic_classifier.jl +++ b/examples/breast_cancer_biopsy/logistic_classifier.jl @@ -66,16 +66,39 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] ENV["logistic_classifier_filename"] = string( tempname(), "logistic_classifier.jld2", ) logistic_classifier_filename = ENV["logistic_classifier_filename"] +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( featurenames, - labelname, - labellevels; + singlelabelname, + singlelabellevels; package = :GLMjl, intercept = true, # optional, defaults to true interactions = 1, # optional, defaults to 1 @@ -94,8 +117,8 @@ logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(logistic_hist_training) @@ -103,8 +126,8 @@ logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( logisticclassifier, testing_features_df, testing_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(logistic_hist_testing) @@ -112,7 +135,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -121,7 +144,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( logisticclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -130,7 
+153,7 @@ logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( logisticclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, + singlelabelname, positiveclass; window = 0.2, ) @@ -140,7 +163,7 @@ PredictMD.probability_calibration_metrics( logisticclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; window = 0.1, ) @@ -149,7 +172,7 @@ logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_ logisticclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; average_function = mean, ) @@ -165,7 +188,7 @@ logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_ logisticclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; average_function = median, ) diff --git a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl index e40c17270..9a464a0cf 100644 --- a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl @@ -67,16 +67,39 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] ENV["nu_svc_svm_classifier_filename"] = string( tempname(), "nu_svc_svm_classifier.jld2", ) nu_svc_svm_classifier_filename = ENV["nu_svc_svm_classifier_filename"] +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( featurenames, - labelname, - labellevels; + singlelabelname, + singlelabellevels; package = :LIBSVMjl, svmtype = LIBSVM.NuSVC, name = "SVM (nu-SVC)", @@ -94,8 +117,8 @@ nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhis nusvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(nusvc_svmclassifier_hist_training) @@ -103,8 +126,8 @@ nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhist nusvc_svmclassifier, testing_features_df, testing_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(nusvc_svmclassifier_hist_testing) @@ -112,7 +135,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -121,7 +144,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( nusvc_svmclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) diff --git a/examples/breast_cancer_biopsy/preprocess_data.jl b/examples/breast_cancer_biopsy/preprocess_data.jl index c71a08d75..f4b79d5ed 100644 --- a/examples/breast_cancer_biopsy/preprocess_data.jl +++ b/examples/breast_cancer_biopsy/preprocess_data.jl @@ -31,7 +31,7 @@ negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -labelnames = [singlelabelname +labelnames = [singlelabelname] features_df = 
df[featurenames] labels_df = df[labelnames] diff --git a/examples/breast_cancer_biopsy/random_forest_classifier.jl b/examples/breast_cancer_biopsy/random_forest_classifier.jl index bbd7739eb..f905d5137 100644 --- a/examples/breast_cancer_biopsy/random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/random_forest_classifier.jl @@ -67,16 +67,40 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) + +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] + ENV["random_forest_classifier_filename"] = string( tempname(), "random_forest_classifier.jld2", ) random_forest_classifier_filename = ENV["random_forest_classifier_filename"] +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( featurenames, - labelname, - labellevels; + singlelabelname, + singlelabellevels; nsubfeatures = 4, # number of subfeatures; defaults to 2 ntrees = 200, # number of trees; defaults to 10 package = :DecisionTreejl, @@ -94,8 +118,8 @@ rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(rfclassifier_hist_training) @@ -103,8 +127,8 @@ rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( rfclassifier, testing_features_df, testing_labels_df, - labelname, - labellevels, + singlelabelname, + singlelabellevels, ) PredictMD.open_plot(rfclassifier_hist_testing) @@ -112,7 +136,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, smoted_training_features_df, smoted_training_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) @@ -121,7 +145,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( rfclassifier, testing_features_df, testing_labels_df, - labelname, + singlelabelname, positiveclass; sensitivity = 0.95, ) diff --git a/examples/breast_cancer_biopsy/smote.jl b/examples/breast_cancer_biopsy/smote.jl index 0a9841290..a1e80bd47 100644 --- a/examples/breast_cancer_biopsy/smote.jl +++ b/examples/breast_cancer_biopsy/smote.jl @@ -54,9 +54,27 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) +categoricalfeaturenames = Symbol[] +continuousfeaturenames = Symbol[ + :V1, + :V2, + :V3, + :V4, + :V5, + :V6, + :V7, + :V8, + :V9, + ] +featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) -DataFrames.describe(training_labels_df[labelname]) -StatsBase.countmap(training_labels_df[labelname]) +singlelabelname = :Class +negativeclass = "benign" +positiveclass = "malignant" +singlelabellevels = [negativeclass, positiveclass] + +DataFrames.describe(training_labels_df[singlelabelname]) +StatsBase.countmap(training_labels_df[singlelabelname]) majorityclass = "benign" minorityclass = "malignant" @@ -65,7 +83,7 @@ smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( training_features_df, training_labels_df, featurenames, - labelname; + singlelabelname; majorityclass = majorityclass, minorityclass = minorityclass, pct_over = 100, # how much to oversample the minority class @@ -73,8 +91,8 @@ 
smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( k = 5, ) -DataFrames.describe(smoted_training_labels_df[labelname]) -StatsBase.countmap(smoted_training_labels_df[labelname]) +DataFrames.describe(smoted_training_labels_df[singlelabelname]) +StatsBase.countmap(smoted_training_labels_df[singlelabelname]) ENV["smoted_training_features_df_filename"] = string( tempname(), From d67c55ac2f68035ee0aec7c54af577c7e570abdb Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:03:47 -0400 Subject: [PATCH 41/62] Progress commit --- .../c_svc_svm_classifier.jl | 24 +++---- .../breast_cancer_biopsy/compare_models.jl | 33 +++++++--- .../breast_cancer_biopsy/get_model_output.jl | 63 +++++++++++-------- .../knet_mlp_classifier.jl | 32 +++++----- .../logistic_classifier.jl | 26 ++++---- .../nu_svc_svm_classifier.jl | 24 +++---- .../random_forest_classifier.jl | 24 +++---- src/io/saveload.jl | 8 +-- 8 files changed, 130 insertions(+), 104 deletions(-) diff --git a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl index b2a88ecf6..4543049a8 100644 --- a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl @@ -63,7 +63,7 @@ smoted_training_features_df = CSV.read( DataFrames.DataFrame, ) smoted_training_labels_df = CSV.read( - smoted_training_features_df_filename, + smoted_training_labels_df_filename, DataFrames.DataFrame, ) @@ -96,7 +96,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts( featurenames, ) -csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( +c_svc_svm_classifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( featurenames, singlelabelname, singlelabellevels; @@ -108,31 +108,31 @@ csvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( ) PredictMD.fit!( - csvc_svmclassifier, + c_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, ) -csvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, +c_svc_svm_classifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + c_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(csvc_svmclassifier_hist_training) +PredictMD.open_plot(c_svc_svm_classifier_hist_training) -csvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - csvc_svmclassifier, +c_svc_svm_classifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + c_svc_svm_classifier, testing_features_df, testing_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(csvc_svmclassifier_hist_testing) +PredictMD.open_plot(c_svc_svm_classifier_hist_testing) PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, + c_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, @@ -141,7 +141,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( ) PredictMD.singlelabelbinaryclassificationmetrics( - csvc_svmclassifier, + c_svc_svm_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -149,4 +149,4 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) -PredictMD.save_model(csvc_svmclassifier_filename, csvc_svmclassifier) +PredictMD.save_model(c_svc_svm_classifier_filename, c_svc_svm_classifier) diff --git a/examples/breast_cancer_biopsy/compare_models.jl 
b/examples/breast_cancer_biopsy/compare_models.jl index f43cc8c32..511eb07a1 100644 --- a/examples/breast_cancer_biopsy/compare_models.jl +++ b/examples/breast_cancer_biopsy/compare_models.jl @@ -54,10 +54,23 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -logisticclassifier = PredictMD.load_model(logisticclassifier_filename) -rfclassifier = PredictMD.load_model(rfclassifier_filename) -csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) -nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename) +smoted_training_features_df_filename = + ENV["smoted_training_features_df_filename"] +smoted_training_labels_df_filename = + ENV["smoted_training_labels_df_filename"] +smoted_training_features_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) +smoted_training_labels_df = CSV.read( + smoted_training_labels_df_filename, + DataFrames.DataFrame, + ) + +logistic_classifier = PredictMD.load_model(logistic_classifier_filename) +random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) +c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) +nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) function knetmlp_predict( w, # don't put a type annotation on this @@ -107,14 +120,14 @@ function knetmlp_loss( return loss end -knetmlpclassifier = PredictMD.load_model(knetmlp_filename) +knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) all_models = PredictMD.Fittable[ - logisticclassifier, - rfclassifier, - csvc_svmclassifier, - nusvc_svmclassifier, - knetmlpclassifier, + logistic_classifier, + random_forest_classifier, + c_svc_svm_classifier, + nu_svc_svm_classifier, + knet_mlp_classifier, ] singlelabelname = :Class diff --git a/examples/breast_cancer_biopsy/get_model_output.jl b/examples/breast_cancer_biopsy/get_model_output.jl index 4e32699cb..3fbd023c3 100644 --- a/examples/breast_cancer_biopsy/get_model_output.jl +++ b/examples/breast_cancer_biopsy/get_model_output.jl @@ -54,10 +54,23 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -logisticclassifier = PredictMD.load_model(logisticclassifier_filename) -rfclassifier = PredictMD.load_model(rfclassifier_filename) -csvc_svmclassifier = PredictMD.load_model(csvc_svmclassifier_filename) -nusvc_svmclassifier = PredictMD.load_model(nusvc_svmclassifier_filename) +smoted_training_features_df_filename = + ENV["smoted_training_features_df_filename"] +smoted_training_labels_df_filename = + ENV["smoted_training_labels_df_filename"] +smoted_training_features_df = CSV.read( + smoted_training_features_df_filename, + DataFrames.DataFrame, + ) +smoted_training_labels_df = CSV.read( + smoted_training_labels_df_filename, + DataFrames.DataFrame, + ) + +logistic_classifier = PredictMD.load_model(logistic_classifier_filename) +random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) +c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) +nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) function knetmlp_predict( w, # don't put a type annotation on this @@ -107,28 +120,28 @@ function knetmlp_loss( return loss end -knetmlpclassifier = PredictMD.load_model(knetmlp_filename) +knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) -PredictMD.predict_proba(logisticclassifier,smoted_training_features_df,) -PredictMD.predict_proba(rfclassifier,smoted_training_features_df,) 
-PredictMD.predict_proba(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict_proba(knetmlpclassifier,smoted_training_features_df,) +PredictMD.predict_proba(logistic_classifier,smoted_training_features_df,) +PredictMD.predict_proba(random_forest_classifier,smoted_training_features_df,) +PredictMD.predict_proba(c_svc_svm_classifier,smoted_training_features_df,) +PredictMD.predict_proba(nu_svc_svm_classifier,smoted_training_features_df,) +PredictMD.predict_proba(knet_mlp_classifier,smoted_training_features_df,) -PredictMD.predict_proba(logisticclassifier,testing_features_df,) -PredictMD.predict_proba(rfclassifier,testing_features_df,) -PredictMD.predict_proba(csvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict_proba(knetmlpclassifier,testing_features_df,) +PredictMD.predict_proba(logistic_classifier,testing_features_df,) +PredictMD.predict_proba(random_forest_classifier,testing_features_df,) +PredictMD.predict_proba(c_svc_svm_classifier,testing_features_df,) +PredictMD.predict_proba(nu_svc_svm_classifier,testing_features_df,) +PredictMD.predict_proba(knet_mlp_classifier,testing_features_df,) -PredictMD.predict(logisticclassifier,smoted_training_features_df,) -PredictMD.predict(rfclassifier,smoted_training_features_df,) -PredictMD.predict(csvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(nusvc_svmclassifier,smoted_training_features_df,) -PredictMD.predict(knetmlpclassifier,smoted_training_features_df,) +PredictMD.predict(logistic_classifier,smoted_training_features_df,) +PredictMD.predict(random_forest_classifier,smoted_training_features_df,) +PredictMD.predict(c_svc_svm_classifier,smoted_training_features_df,) +PredictMD.predict(nu_svc_svm_classifier,smoted_training_features_df,) +PredictMD.predict(knet_mlp_classifier,smoted_training_features_df,) -PredictMD.predict(logisticclassifier,testing_features_df,) -PredictMD.predict(rfclassifier,testing_features_df,) -PredictMD.predict(csvc_svmclassifier,testing_features_df,) -PredictMD.predict(nusvc_svmclassifier,testing_features_df,) -PredictMD.predict(knetmlpclassifier,testing_features_df,) +PredictMD.predict(logistic_classifier,testing_features_df,) +PredictMD.predict(random_forest_classifier,testing_features_df,) +PredictMD.predict(c_svc_svm_classifier,testing_features_df,) +PredictMD.predict(nu_svc_svm_classifier,testing_features_df,) +PredictMD.predict(knet_mlp_classifier,testing_features_df,) diff --git a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl index 77b6c60f2..f9b08aa3e 100644 --- a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl +++ b/examples/breast_cancer_biopsy/knet_mlp_classifier.jl @@ -63,7 +63,7 @@ smoted_training_features_df = CSV.read( DataFrames.DataFrame, ) smoted_training_labels_df = CSV.read( - smoted_training_features_df_filename, + smoted_training_labels_df_filename, DataFrames.DataFrame, ) @@ -182,7 +182,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts( featurenames, ) -knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( +knet_mlp_classifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( featurenames, singlelabelname, singlelabellevels; @@ -201,7 +201,7 @@ knetmlpclassifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( ) PredictMD.fit!( - knetmlpclassifier, + knet_mlp_classifier, smoted_training_features_df, 
smoted_training_labels_df,
     validation_features_df,
@@ -209,13 +209,13 @@ PredictMD.fit!(
     )
 
 knet_learningcurve_lossvsepoch = PredictMD.plotlearningcurve(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     :loss_vs_epoch;
     )
 PredictMD.open_plot(knet_learningcurve_lossvsepoch)
 
 knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     :loss_vs_epoch;
     startat = 10,
     endat = :end,
@@ -223,7 +223,7 @@ knet_learningcurve_lossvsepoch_skip10epochs = PredictMD.plotlearningcurve(
 PredictMD.open_plot(knet_learningcurve_lossvsepoch_skip10epochs)
 
 knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     :loss_vs_iteration;
     window = 50,
     sampleevery = 10,
@@ -231,7 +231,7 @@ knet_learningcurve_lossvsiteration = PredictMD.plotlearningcurve(
 PredictMD.open_plot(knet_learningcurve_lossvsiteration)
 
 knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcurve(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     :loss_vs_iteration;
     window = 50,
     sampleevery = 10,
@@ -240,26 +240,26 @@ knet_learningcurve_lossvsiteration_skip100iterations = PredictMD.plotlearningcur
     )
 PredictMD.open_plot(knet_learningcurve_lossvsiteration_skip100iterations)
 
-knetmlpclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    knetmlpclassifier,
+knet_mlp_classifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knet_mlp_classifier,
     smoted_training_features_df,
     smoted_training_labels_df,
     singlelabelname,
     singlelabellevels,
     )
-PredictMD.open_plot(knetmlpclassifier_hist_training)
+PredictMD.open_plot(knet_mlp_classifier_hist_training)
 
-knetmlpclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
-    knetmlpclassifier,
+knet_mlp_classifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram(
+    knet_mlp_classifier,
     testing_features_df,
     testing_labels_df,
     singlelabelname,
     singlelabellevels,
     )
-PredictMD.open_plot(knetmlpclassifier_hist_testing)
+PredictMD.open_plot(knet_mlp_classifier_hist_testing)
 
 PredictMD.singlelabelbinaryclassificationmetrics(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     smoted_training_features_df,
     smoted_training_labels_df,
     singlelabelname,
@@ -268,7 +268,7 @@ PredictMD.singlelabelbinaryclassificationmetrics(
     )
 
 PredictMD.singlelabelbinaryclassificationmetrics(
-    knetmlpclassifier,
+    knet_mlp_classifier,
     testing_features_df,
     testing_labels_df,
     singlelabelname,
@@ -276,4 +276,4 @@ PredictMD.singlelabelbinaryclassificationmetrics(
     sensitivity = 0.95,
     )
 
-PredictMD.save_model(knetmlp_filename, knetmlpclassifier)
+PredictMD.save_model(knet_mlp_classifier_filename, knet_mlp_classifier)
diff --git a/examples/breast_cancer_biopsy/logistic_classifier.jl b/examples/breast_cancer_biopsy/logistic_classifier.jl
index 4b4c6657b..a34c18585 100644
--- a/examples/breast_cancer_biopsy/logistic_classifier.jl
+++ b/examples/breast_cancer_biopsy/logistic_classifier.jl
@@ -62,7 +62,7 @@ smoted_training_features_df = CSV.read(
     DataFrames.DataFrame,
     )
 smoted_training_labels_df = CSV.read(
-    smoted_training_features_df_filename,
+    smoted_training_labels_df_filename,
     DataFrames.DataFrame,
     )
 
@@ -95,7 +95,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts(
     featurenames,
     )
 
-logisticclassifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier(
+logistic_classifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier(
     featurenames,
     singlelabelname,
     singlelabellevels;
@@ -106,15 +106,15 @@ logisticclassifier = 
PredictMD.singlelabelbinaryclassdataframelogisticclassifier ) PredictMD.fit!( - logisticclassifier, + logistic_classifier, smoted_training_features_df, smoted_training_labels_df, ) -PredictMD.get_underlying(logisticclassifier) +PredictMD.get_underlying(logistic_classifier) logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, + logistic_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, @@ -123,7 +123,7 @@ logistic_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( PredictMD.open_plot(logistic_hist_training) logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -132,7 +132,7 @@ logistic_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( PredictMD.open_plot(logistic_hist_testing) PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -141,7 +141,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( ) PredictMD.singlelabelbinaryclassificationmetrics( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -150,7 +150,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( ) logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( - logisticclassifier, + logistic_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, @@ -160,7 +160,7 @@ logistic_calibration_curve = PredictMD.plot_probability_calibration_curve( PredictMD.open_plot(logistic_calibration_curve) PredictMD.probability_calibration_metrics( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -169,7 +169,7 @@ PredictMD.probability_calibration_metrics( ) logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -185,7 +185,7 @@ println( ) showall(logistic_risk_group_prevalences) logistic_cutoffs, logistic_risk_group_prevalences = PredictMD.risk_score_cutoff_values( - logisticclassifier, + logistic_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -201,4 +201,4 @@ println( ) showall(logistic_risk_group_prevalences) -PredictMD.save_model(logisticclassifier_filename, logisticclassifier) +PredictMD.save_model(logistic_classifier_filename, logistic_classifier) diff --git a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl index 9a464a0cf..77e0b88b9 100644 --- a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl @@ -63,7 +63,7 @@ smoted_training_features_df = CSV.read( DataFrames.DataFrame, ) smoted_training_labels_df = CSV.read( - smoted_training_features_df_filename, + smoted_training_labels_df_filename, DataFrames.DataFrame, ) @@ -96,7 +96,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts( featurenames, ) -nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( +nu_svc_svm_classifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( featurenames, singlelabelname, singlelabellevels; @@ -108,31 +108,31 @@ nusvc_svmclassifier = PredictMD.singlelabelmulticlassdataframesvmclassifier( ) PredictMD.fit!( - nusvc_svmclassifier, + 
nu_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, ) -nusvc_svmclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, +nu_svc_svm_classifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nu_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(nusvc_svmclassifier_hist_training) +PredictMD.open_plot(nu_svc_svm_classifier_hist_training) -nusvc_svmclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - nusvc_svmclassifier, +nu_svc_svm_classifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + nu_svc_svm_classifier, testing_features_df, testing_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(nusvc_svmclassifier_hist_testing) +PredictMD.open_plot(nu_svc_svm_classifier_hist_testing) PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, + nu_svc_svm_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, @@ -141,7 +141,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( ) PredictMD.singlelabelbinaryclassificationmetrics( - nusvc_svmclassifier, + nu_svc_svm_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -149,4 +149,4 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) -PredictMD.save_model(nusvc_svmclassifier_filename, nusvc_svmclassifier) +PredictMD.save_model(nu_svc_svm_classifier_filename, nu_svc_svm_classifier) diff --git a/examples/breast_cancer_biopsy/random_forest_classifier.jl b/examples/breast_cancer_biopsy/random_forest_classifier.jl index f905d5137..31cc33b6a 100644 --- a/examples/breast_cancer_biopsy/random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/random_forest_classifier.jl @@ -63,7 +63,7 @@ smoted_training_features_df = CSV.read( DataFrames.DataFrame, ) smoted_training_labels_df = CSV.read( - smoted_training_features_df_filename, + smoted_training_labels_df_filename, DataFrames.DataFrame, ) @@ -97,7 +97,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts( featurenames, ) -rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( +random_forest_classifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( featurenames, singlelabelname, singlelabellevels; @@ -109,31 +109,31 @@ rfclassifier = PredictMD.singlelabelmulticlassdataframerandomforestclassifier( ) PredictMD.fit!( - rfclassifier, + random_forest_classifier, smoted_training_features_df, smoted_training_labels_df, ) -rfclassifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, +random_forest_classifier_hist_training = PredictMD.plotsinglelabelbinaryclassifierhistogram( + random_forest_classifier, smoted_training_features_df, smoted_training_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(rfclassifier_hist_training) +PredictMD.open_plot(random_forest_classifier_hist_training) -rfclassifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( - rfclassifier, +random_forest_classifier_hist_testing = PredictMD.plotsinglelabelbinaryclassifierhistogram( + random_forest_classifier, testing_features_df, testing_labels_df, singlelabelname, singlelabellevels, ) -PredictMD.open_plot(rfclassifier_hist_testing) +PredictMD.open_plot(random_forest_classifier_hist_testing) PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, + random_forest_classifier, 
smoted_training_features_df, smoted_training_labels_df, singlelabelname, @@ -142,7 +142,7 @@ PredictMD.singlelabelbinaryclassificationmetrics( ) PredictMD.singlelabelbinaryclassificationmetrics( - rfclassifier, + random_forest_classifier, testing_features_df, testing_labels_df, singlelabelname, @@ -150,4 +150,4 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) -PredictMD.save_model(rfclassifier_filename, rfclassifier) +PredictMD.save_model(random_forest_classifier_filename, random_forest_classifier) diff --git a/src/io/saveload.jl b/src/io/saveload.jl index 60c0a79b0..e96a6d4ee 100644 --- a/src/io/saveload.jl +++ b/src/io/saveload.jl @@ -18,13 +18,13 @@ function save_model(filename::AbstractString,fittable_object_to_save::Fittable) dict_of_objects_to_save = Dict( "saved_model" => fittable_object_to_save, ) - info("DEBUG Attempting to save model...") + info("INFO Attempting to save model...") # make sure the parent directory exists parent_directory = Base.Filesystem.dirname(filename) Base.Filesystem.mkpath(parent_directory) # save the .jld2 file FileIO.save(filename, dict_of_objects_to_save) - info(string("DEBUG Saved model to file \"", filename, "\"")) + info(string("INFO Saved model to file \"", filename, "\"")) return nothing end @@ -40,9 +40,9 @@ function load_model(filename::AbstractString) "\" does not end in \".jld2\"") ) end - info("DEBUG Attempting to load model...") + info("INFO Attempting to load model...") dict_of_loaded_objects = FileIO.load(filename) loaded_fittable_object = dict_of_loaded_objects["saved_model"] - info(string("DEBUG Loaded model from file \"", filename, "\"")) + info(string("INFO Loaded model from file \"", filename, "\"")) return loaded_fittable_object end From 49b57a251f36048cfcc0330498690fd7d8b9f71a Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:05:22 -0400 Subject: [PATCH 42/62] Rename files --- .../{preprocess_data.jl => 01_preprocess_data.jl} | 0 examples/breast_cancer_biopsy/{smote.jl => 02_smote.jl} | 0 .../{logistic_classifier.jl => 03_logistic_classifier.jl} | 0 ...random_forest_classifier.jl => 04_random_forest_classifier.jl} | 0 .../{c_svc_svm_classifier.jl => 05_c_svc_svm_classifier.jl} | 0 .../{nu_svc_svm_classifier.jl => 06_nu_svc_svm_classifier.jl} | 0 .../{knet_mlp_classifier.jl => 07_knet_mlp_classifier.jl} | 0 .../{compare_models.jl => 08_compare_models.jl} | 0 .../{get_model_output.jl => 09_get_model_output.jl} | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename examples/breast_cancer_biopsy/{preprocess_data.jl => 01_preprocess_data.jl} (100%) rename examples/breast_cancer_biopsy/{smote.jl => 02_smote.jl} (100%) rename examples/breast_cancer_biopsy/{logistic_classifier.jl => 03_logistic_classifier.jl} (100%) rename examples/breast_cancer_biopsy/{random_forest_classifier.jl => 04_random_forest_classifier.jl} (100%) rename examples/breast_cancer_biopsy/{c_svc_svm_classifier.jl => 05_c_svc_svm_classifier.jl} (100%) rename examples/breast_cancer_biopsy/{nu_svc_svm_classifier.jl => 06_nu_svc_svm_classifier.jl} (100%) rename examples/breast_cancer_biopsy/{knet_mlp_classifier.jl => 07_knet_mlp_classifier.jl} (100%) rename examples/breast_cancer_biopsy/{compare_models.jl => 08_compare_models.jl} (100%) rename examples/breast_cancer_biopsy/{get_model_output.jl => 09_get_model_output.jl} (100%) diff --git a/examples/breast_cancer_biopsy/preprocess_data.jl b/examples/breast_cancer_biopsy/01_preprocess_data.jl similarity index 100% rename from examples/breast_cancer_biopsy/preprocess_data.jl 
rename to examples/breast_cancer_biopsy/01_preprocess_data.jl diff --git a/examples/breast_cancer_biopsy/smote.jl b/examples/breast_cancer_biopsy/02_smote.jl similarity index 100% rename from examples/breast_cancer_biopsy/smote.jl rename to examples/breast_cancer_biopsy/02_smote.jl diff --git a/examples/breast_cancer_biopsy/logistic_classifier.jl b/examples/breast_cancer_biopsy/03_logistic_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/logistic_classifier.jl rename to examples/breast_cancer_biopsy/03_logistic_classifier.jl diff --git a/examples/breast_cancer_biopsy/random_forest_classifier.jl b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/random_forest_classifier.jl rename to examples/breast_cancer_biopsy/04_random_forest_classifier.jl diff --git a/examples/breast_cancer_biopsy/c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/c_svc_svm_classifier.jl rename to examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl diff --git a/examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/nu_svc_svm_classifier.jl rename to examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl diff --git a/examples/breast_cancer_biopsy/knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/knet_mlp_classifier.jl rename to examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl diff --git a/examples/breast_cancer_biopsy/compare_models.jl b/examples/breast_cancer_biopsy/08_compare_models.jl similarity index 100% rename from examples/breast_cancer_biopsy/compare_models.jl rename to examples/breast_cancer_biopsy/08_compare_models.jl diff --git a/examples/breast_cancer_biopsy/get_model_output.jl b/examples/breast_cancer_biopsy/09_get_model_output.jl similarity index 100% rename from examples/breast_cancer_biopsy/get_model_output.jl rename to examples/breast_cancer_biopsy/09_get_model_output.jl From 80d182fe4d0011b8200bf34df1a0e357fa56f63f Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:09:57 -0400 Subject: [PATCH 43/62] Progress commit --- docs/make_docs.jl | 90 +++++++++++++++++++++++------------------------ docs/mkdocs.yml | 1 - 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 5f2f04ce8..5b69547d4 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -28,108 +28,108 @@ boston_housing_output_directory = joinpath( ) Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "02_linear_regression.jl"), boston_housing_output_directory; documenter = true, ) 
Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "02_linear_regression.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "02_linear_regression.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "03_random_forest_regression.jl"), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "03_random_forest_regression.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "03_random_forest_regression.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "04_knet_mlp_regression.jl"), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "04_knet_mlp_regression.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "04_knet_mlp_regression.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "05_compare_models.jl"), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "05_compare_models.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "05_compare_models.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "06_get_model_output.jl"), boston_housing_output_directory; documenter = true, ) Literate.notebook( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "06_get_model_output.jl"), boston_housing_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(boston_housing_input_directory, ""), + joinpath(boston_housing_input_directory, "06_get_model_output.jl"), boston_housing_output_directory; documenter = true, keep_comments = true, @@ -145,162 +145,162 @@ breast_cancer_biopsy_output_directory = joinpath( ) Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), 
breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "02_smote.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "02_smote.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "02_smote.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "03_logistic_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "03_logistic_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "03_logistic_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "04_random_forest_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "04_random_forest_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "04_random_forest_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "05_c_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "05_c_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "05_c_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "06_nu_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "06_nu_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "06_nu_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + 
joinpath(breast_cancer_biopsy_input_directory, "07_knet_mlp_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "07_knet_mlp_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "07_knet_mlp_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "08_compare_models.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "08_compare_models.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "08_compare_models.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, ) # Literate.markdown( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "09_get_model_output.jl"), breast_cancer_biopsy_output_directory; documenter = true, ) Literate.notebook( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "09_get_model_output.jl"), breast_cancer_biopsy_output_directory; documenter = true, execute = false, ) Literate.script( - joinpath(breast_cancer_biopsy_input_directory, ""), + joinpath(breast_cancer_biopsy_input_directory, "09_get_model_output.jl"), breast_cancer_biopsy_output_directory; documenter = true, keep_comments = true, diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 87e73e7b1..453cd9ee5 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -23,6 +23,5 @@ docs_dir: 'build' pages: - Home: index.md - Examples: examples.md - - Boy: examples/boston_housing/boston_housing.md - Library: - 'Internals': 'library/internals.md' From 120f0f609c4ffa3caac20f749e5fa693469e3c07 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:25:40 -0400 Subject: [PATCH 44/62] Eliminate ENV --- examples/boston_housing/01_preprocess_data.jl | 80 ++++++++----------- .../boston_housing/02_linear_regression.jl | 55 ++++++++----- .../03_random_forest_regression.jl | 56 ++++++++----- .../boston_housing/04_knet_mlp_regression.jl | 55 ++++++++----- examples/boston_housing/05_compare_models.jl | 63 ++++++++++----- .../boston_housing/06_get_model_output.jl | 63 ++++++++++----- 6 files changed, 226 insertions(+), 146 deletions(-) diff --git a/examples/boston_housing/01_preprocess_data.jl b/examples/boston_housing/01_preprocess_data.jl index 8d1406ca6..1a180160d 100644 --- a/examples/boston_housing/01_preprocess_data.jl +++ b/examples/boston_housing/01_preprocess_data.jl @@ -65,54 +65,38 @@ training_features_df, 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation ) -ENV["trainingandvalidation_features_df_filename"] = string( - tempname(), - "_trainingandvalidation_features_df.csv", - ) -ENV["trainingandvalidation_labels_df_filename"] = string( - tempname(), - "_trainingandvalidation_labels_df.csv", - ) -ENV["testing_features_df_filename"] = string( - tempname(), - "_testing_features_df.csv", - ) 
-ENV["testing_labels_df_filename"] = string( - tempname(), - "_.testing_labels_dfcsv", - ) -ENV["training_features_df_filename"] = string( - tempname(), - "_training_features_df.csv", - ) -ENV["training_labels_df_filename"] = string( - tempname(), - "_training_labels_df.csv", - ) -ENV["validation_features_df_filename"] = string( - tempname(), - "_validation_features_df.csv", - ) -ENV["validation_labels_df_filename"] = string( - tempname(), - "_validation_labels_df.csv", - ) -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) CSV.write( trainingandvalidation_features_df_filename, trainingandvalidation_features_df, diff --git a/examples/boston_housing/02_linear_regression.jl b/examples/boston_housing/02_linear_regression.jl index 22fe04cbe..839be22b3 100644 --- a/examples/boston_housing/02_linear_regression.jl +++ b/examples/boston_housing/02_linear_regression.jl @@ -4,22 +4,38 @@ import CSV import DataFrames import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) 
+validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -74,11 +90,10 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] -ENV["linear_regression_filename"] = string( - tempname(), - "_linear_regression.jld2", +linear_regression_filename = joinpath( + tempdir(), + "linear_regression.jld2", ) -linear_regression_filename = ENV["linear_regression_filename"] linear_regression = PredictMD.singlelabeldataframelinearregression( featurenames, diff --git a/examples/boston_housing/03_random_forest_regression.jl b/examples/boston_housing/03_random_forest_regression.jl index 85465e745..39e6e74fb 100644 --- a/examples/boston_housing/03_random_forest_regression.jl +++ b/examples/boston_housing/03_random_forest_regression.jl @@ -4,22 +4,38 @@ import CSV import DataFrames import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -74,11 +90,11 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] -ENV["random_forest_regression_filename"] = string( - tempname(), - "_random_forest_regression.jld2", + +random_forest_regression_filename = joinpath( + tempdir(), + "random_forest_regression.jld2", ) -random_forest_regression_filename = ENV["random_forest_regression_filename"] feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) diff --git a/examples/boston_housing/04_knet_mlp_regression.jl b/examples/boston_housing/04_knet_mlp_regression.jl index 340de0dc0..4ba167349 100644 --- a/examples/boston_housing/04_knet_mlp_regression.jl +++ b/examples/boston_housing/04_knet_mlp_regression.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] 
-testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -75,11 +91,10 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] -ENV["knet_mlp_regression_filename"] = string( - tempname(), - "_knet_mlp_regression.jld2", +knet_mlp_regression_filename = joinpath( + tempdir(), + "knet_mlp_regression.jld2", ) -knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] function knetmlp_predict( w, # don't put a type annotation on this diff --git a/examples/boston_housing/05_compare_models.jl b/examples/boston_housing/05_compare_models.jl index 72bd38168..c305e0519 100644 --- a/examples/boston_housing/05_compare_models.jl +++ b/examples/boston_housing/05_compare_models.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ 
-54,9 +70,18 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -linear_regression_filename = ENV["linear_regression_filename"] -random_forest_regression_filename = ENV["random_forest_regression_filename"] -knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] +linear_regression_filename = joinpath( + tempdir(), + "linear_regression.jld2", + ) +random_forest_regression_filename = joinpath( + tempdir(), + "random_forest_regression.jld2", + ) +knet_mlp_regression_filename = joinpath( + tempdir(), + "knet_mlp_regression.jld2", + ) linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) diff --git a/examples/boston_housing/06_get_model_output.jl b/examples/boston_housing/06_get_model_output.jl index d276065ec..5f5662b5c 100644 --- a/examples/boston_housing/06_get_model_output.jl +++ b/examples/boston_housing/06_get_model_output.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,9 +70,18 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -linear_regression_filename = ENV["linear_regression_filename"] -random_forest_regression_filename = ENV["random_forest_regression_filename"] -knet_mlp_regression_filename = ENV["knet_mlp_regression_filename"] +linear_regression_filename = joinpath( + tempdir(), + "linear_regression.jld2", + ) +random_forest_regression_filename = joinpath( + tempdir(), + "random_forest_regression.jld2", + ) +knet_mlp_regression_filename = joinpath( + tempdir(), + "knet_mlp_regression.jld2", + ) linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) From d61db510e1b25ac193c44dca912d7ef3a4e0b1ba Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:53:10 -0400 Subject: [PATCH 45/62] Another major progress commit --- docs/make_docs.jl | 3 + examples/boston_housing/01_preprocess_data.jl | 15 ++++ .../boston_housing/02_linear_regression.jl | 26 ++++-- 
.../03_random_forest_regression.jl | 27 ++++-- .../boston_housing/04_knet_mlp_regression.jl | 26 ++++-- examples/boston_housing/05_compare_models.jl | 20 ++++- .../boston_housing/06_get_model_output.jl | 13 ++- .../01_preprocess_data.jl | 80 +++++++----------- examples/breast_cancer_biopsy/02_smote.jl | 69 ++++++++------- .../03_logistic_classifier.jl | 65 ++++++++++----- .../04_random_forest_classifier.jl | 65 ++++++++++----- .../05_c_svc_svm_classifier.jl | 65 ++++++++++----- .../06_nu_svc_svm_classifier.jl | 69 ++++++++++----- .../07_knet_mlp_classifier.jl | 65 ++++++++++----- .../breast_cancer_biopsy/08_compare_models.jl | 83 ++++++++++++++----- .../09_get_model_output.jl | 83 ++++++++++++++----- src/utils/tikzpictures.jl | 10 ++- 17 files changed, 528 insertions(+), 256 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 5b69547d4..f4398df9f 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -17,6 +17,7 @@ examples_output_parent_directory = joinpath( "src", "examples", ) +mkpath(examples_output_parent_directory) boston_housing_input_directory = joinpath( examples_input_parent_directory, @@ -26,6 +27,7 @@ boston_housing_output_directory = joinpath( examples_output_parent_directory, "boston_housing", ) +mkpath(boston_housing_output_directory) Literate.markdown( joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), @@ -143,6 +145,7 @@ breast_cancer_biopsy_output_directory = joinpath( examples_output_parent_directory, "breast_cancer_biopsyg", ) +mkpath(breast_cancer_biopsy_output_directory) Literate.markdown( joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), diff --git a/examples/boston_housing/01_preprocess_data.jl b/examples/boston_housing/01_preprocess_data.jl index 1a180160d..25477b9d9 100644 --- a/examples/boston_housing/01_preprocess_data.jl +++ b/examples/boston_housing/01_preprocess_data.jl @@ -65,36 +65,51 @@ training_features_df, 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation ) +mkpath( + joinpath( + tempdir(), + "boston_housing_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) CSV.write( diff --git a/examples/boston_housing/02_linear_regression.jl b/examples/boston_housing/02_linear_regression.jl index 839be22b3..6edee7a5c 100644 --- a/examples/boston_housing/02_linear_regression.jl +++ b/examples/boston_housing/02_linear_regression.jl @@ -4,36 +4,51 @@ import CSV import DataFrames import PredictMD +mkpath( + joinpath( + tempdir(), + "boston_housing_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) 
trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -90,11 +105,6 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] -linear_regression_filename = joinpath( - tempdir(), - "linear_regression.jld2", - ) - linear_regression = PredictMD.singlelabeldataframelinearregression( featurenames, singlelabelname; @@ -138,4 +148,10 @@ PredictMD.singlelabelregressionmetrics( singlelabelname, ) +linear_regression_filename = joinpath( + tempdir(), + "boston_housing_example", + "linear_regression.jld2", + ) + PredictMD.save_model(linear_regression_filename, linear_regression) diff --git a/examples/boston_housing/03_random_forest_regression.jl b/examples/boston_housing/03_random_forest_regression.jl index 39e6e74fb..554eaea4b 100644 --- a/examples/boston_housing/03_random_forest_regression.jl +++ b/examples/boston_housing/03_random_forest_regression.jl @@ -4,36 +4,51 @@ import CSV import DataFrames import PredictMD +mkpath( + joinpath( + tempdir(), + "boston_housing_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -90,12 +105,6 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] - -random_forest_regression_filename = joinpath( - tempdir(), - "random_forest_regression.jld2", - ) - feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) random_forest_regression = PredictMD.singlelabeldataframerandomforestregression( @@ -140,4 +149,10 @@ PredictMD.singlelabelregressionmetrics( singlelabelname, ) +random_forest_regression_filename = joinpath( + tempdir(), + "boston_housing_example", + "random_forest_regression.jld2", + ) + PredictMD.save_model(random_forest_regression_filename, random_forest_regression) diff --git 
a/examples/boston_housing/04_knet_mlp_regression.jl b/examples/boston_housing/04_knet_mlp_regression.jl index 4ba167349..79fe41f3e 100644 --- a/examples/boston_housing/04_knet_mlp_regression.jl +++ b/examples/boston_housing/04_knet_mlp_regression.jl @@ -5,36 +5,51 @@ import DataFrames import Knet import PredictMD +mkpath( + joinpath( + tempdir(), + "boston_housing_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -91,11 +106,6 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] -knet_mlp_regression_filename = joinpath( - tempdir(), - "knet_mlp_regression.jld2", - ) - function knetmlp_predict( w, # don't put a type annotation on this x0::AbstractArray, @@ -249,4 +259,10 @@ PredictMD.singlelabelregressionmetrics( singlelabelname, ) +knet_mlp_regression_filename = joinpath( + tempdir(), + "boston_housing_example", + "knet_mlp_regression.jld2", + ) + PredictMD.save_model(knet_mlp_regression_filename, knet_mlp_regression) diff --git a/examples/boston_housing/05_compare_models.jl b/examples/boston_housing/05_compare_models.jl index c305e0519..3e49332f8 100644 --- a/examples/boston_housing/05_compare_models.jl +++ b/examples/boston_housing/05_compare_models.jl @@ -5,36 +5,51 @@ import DataFrames import Knet import PredictMD +mkpath( + joinpath( + tempdir(), + "boston_housing_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,14 +87,17 @@ validation_labels_df = CSV.read( linear_regression_filename = joinpath( tempdir(), + "boston_housing_example", "linear_regression.jld2", ) random_forest_regression_filename = joinpath( tempdir(), + "boston_housing_example", 
"random_forest_regression.jld2", ) knet_mlp_regression_filename = joinpath( tempdir(), + "boston_housing_example", "knet_mlp_regression.jld2", ) @@ -97,7 +115,6 @@ function knetmlp_predict( x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases return x2 end - function knetmlp_loss( predict_function::Function, modelweights, # don't put a type annotation on this @@ -121,7 +138,6 @@ function knetmlp_loss( end return loss end - knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) all_models = PredictMD.Fittable[ diff --git a/examples/boston_housing/06_get_model_output.jl b/examples/boston_housing/06_get_model_output.jl index 5f5662b5c..da807a0c2 100644 --- a/examples/boston_housing/06_get_model_output.jl +++ b/examples/boston_housing/06_get_model_output.jl @@ -7,34 +7,42 @@ import PredictMD trainingandvalidation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "boston_housing_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,14 +80,17 @@ validation_labels_df = CSV.read( linear_regression_filename = joinpath( tempdir(), + "boston_housing_example", "linear_regression.jld2", ) random_forest_regression_filename = joinpath( tempdir(), + "boston_housing_example", "random_forest_regression.jld2", ) knet_mlp_regression_filename = joinpath( tempdir(), + "boston_housing_example", "knet_mlp_regression.jld2", ) @@ -97,7 +108,6 @@ function knetmlp_predict( x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases return x2 end - function knetmlp_loss( predict_function::Function, modelweights, # don't put a type annotation on this @@ -121,7 +131,6 @@ function knetmlp_loss( end return loss end - knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) PredictMD.predict(linear_regression,training_features_df,) diff --git a/examples/breast_cancer_biopsy/01_preprocess_data.jl b/examples/breast_cancer_biopsy/01_preprocess_data.jl index f4b79d5ed..f025d6f04 100644 --- a/examples/breast_cancer_biopsy/01_preprocess_data.jl +++ b/examples/breast_cancer_biopsy/01_preprocess_data.jl @@ -53,54 +53,38 @@ training_features_df, 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation ) -ENV["trainingandvalidation_features_df_filename"] = string( - tempname(), - "_trainingandvalidation_features_df.csv", - ) -ENV["trainingandvalidation_labels_df_filename"] = string( - tempname(), - "_trainingandvalidation_labels_df.csv", - ) -ENV["testing_features_df_filename"] = string( - tempname(), - "_testing_features_df.csv", - ) -ENV["testing_labels_df_filename"] = string( - tempname(), - "_.testing_labels_dfcsv", - ) -ENV["training_features_df_filename"] = string( - tempname(), - "_training_features_df.csv", - ) -ENV["training_labels_df_filename"] = string( - tempname(), 
- "_training_labels_df.csv", - ) -ENV["validation_features_df_filename"] = string( - tempname(), - "_validation_features_df.csv", - ) -ENV["validation_labels_df_filename"] = string( - tempname(), - "_validation_labels_df.csv", - ) -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) CSV.write( trainingandvalidation_features_df_filename, trainingandvalidation_features_df, diff --git a/examples/breast_cancer_biopsy/02_smote.jl b/examples/breast_cancer_biopsy/02_smote.jl index a1e80bd47..c1a3a42b5 100644 --- a/examples/breast_cancer_biopsy/02_smote.jl +++ b/examples/breast_cancer_biopsy/02_smote.jl @@ -5,22 +5,38 @@ import DataFrames import PredictMD import StatsBase -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -94,18 +110,15 @@ smoted_training_features_df, 
smoted_training_labels_df = PredictMD.smote( DataFrames.describe(smoted_training_labels_df[singlelabelname]) StatsBase.countmap(smoted_training_labels_df[singlelabelname]) -ENV["smoted_training_features_df_filename"] = string( - tempname(), - "_smoted_training_features_df.csv", - ) -ENV["smoted_training_labels_df_filename"] = string( - tempname(), - "_smoted_training_labels_df.csv", - ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] + +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) CSV.write( smoted_training_features_df_filename, smoted_training_features_df, diff --git a/examples/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/breast_cancer_biopsy/03_logistic_classifier.jl index a34c18585..4f1a893c3 100644 --- a/examples/breast_cancer_biopsy/03_logistic_classifier.jl +++ b/examples/breast_cancer_biopsy/03_logistic_classifier.jl @@ -4,22 +4,38 @@ import CSV import DataFrames import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -53,10 +69,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -84,11 +104,10 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -ENV["logistic_classifier_filename"] = string( - tempname(), +logistic_classifier_filename = joinpath( + tempdir(), "logistic_classifier.jld2", ) -logistic_classifier_filename 
= ENV["logistic_classifier_filename"] feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, diff --git a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl index 31cc33b6a..dc7abfccb 100644 --- a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -5,22 +5,38 @@ import CSV import DataFrames import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,10 +70,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -86,11 +106,10 @@ negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -ENV["random_forest_classifier_filename"] = string( - tempname(), +random_forest_classifier_filename = joinpath( + tempdir(), "random_forest_classifier.jld2", ) -random_forest_classifier_filename = ENV["random_forest_classifier_filename"] feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, diff --git a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl index 4543049a8..e18a78af6 100644 --- a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl @@ -5,22 +5,38 @@ import DataFrames import LIBSVM import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] 
-testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,10 +70,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -85,11 +105,10 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -ENV["c_svc_svm_classifier_filename"] = string( - tempname(), +c_svc_svm_classifier_filename = joinpath( + tempdir(), "c_svc_svm_classifier.jld2", ) -c_svc_svm_classifier_filename = ENV["c_svc_svm_classifier_filename"] feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, diff --git a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl index 77e0b88b9..f9b2c5882 100644 --- a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl @@ -5,22 +5,42 @@ import DataFrames import LIBSVM import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + 
tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) +trainingandvalidation_features_df = CSV.read( + trainingandvalidation_features_df_filename, + DataFrames.DataFrame, + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,10 +74,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -85,11 +109,10 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -ENV["nu_svc_svm_classifier_filename"] = string( - tempname(), +nu_svc_svm_classifier_filename = joinpath( + tempdir(), "nu_svc_svm_classifier.jld2", ) -nu_svc_svm_classifier_filename = ENV["nu_svc_svm_classifier_filename"] feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, diff --git a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl index f9b08aa3e..730f5ab15 100644 --- a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, 
DataFrames.DataFrame, @@ -54,10 +70,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -86,11 +106,10 @@ negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -ENV["knet_mlp_classifier_filename"] = string( - tempname(), +knet_mlp_classifier_filename = joinpath( + tempdir(), "knet_mlp_classifier.jld2", ) -knet_mlp_classifier_filename = ENV["knet_mlp_classifier_filename"] function knetmlp_predict( w, # don't put a type annotation on this diff --git a/examples/breast_cancer_biopsy/08_compare_models.jl b/examples/breast_cancer_biopsy/08_compare_models.jl index 511eb07a1..1dd52a489 100644 --- a/examples/breast_cancer_biopsy/08_compare_models.jl +++ b/examples/breast_cancer_biopsy/08_compare_models.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,10 +70,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -67,6 +87,27 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +logistic_classifier_filename = joinpath( + tempdir(), + "logistic_classifier.jld2", + ) +random_forest_classifier_filename = 
joinpath( + tempdir(), + "random_forest_classifier.jld2", + ) +c_svc_svm_classifier_filename = joinpath( + tempdir(), + "c_svc_svm_classifier.jld2", + ) +nu_svc_svm_classifier_filename = joinpath( + tempdir(), + "nu_svc_svm_classifier.jld2", + ) +knet_mlp_classifier_filename = joinpath( + tempdir(), + "knet_mlp_classifier.jld2", + ) + logistic_classifier = PredictMD.load_model(logistic_classifier_filename) random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) @@ -93,7 +134,6 @@ function knetmlp_predict( return unnormalizedlogprobs end end - function knetmlp_loss( predict::Function, modelweights, # don't put a type annotation on this @@ -119,7 +159,6 @@ function knetmlp_loss( end return loss end - knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) all_models = PredictMD.Fittable[ diff --git a/examples/breast_cancer_biopsy/09_get_model_output.jl b/examples/breast_cancer_biopsy/09_get_model_output.jl index 3fbd023c3..455ef477e 100644 --- a/examples/breast_cancer_biopsy/09_get_model_output.jl +++ b/examples/breast_cancer_biopsy/09_get_model_output.jl @@ -5,22 +5,38 @@ import DataFrames import Knet import PredictMD -trainingandvalidation_features_df_filename = - ENV["trainingandvalidation_features_df_filename"] -trainingandvalidation_labels_df_filename = - ENV["trainingandvalidation_labels_df_filename"] -testing_features_df_filename = - ENV["testing_features_df_filename"] -testing_labels_df_filename = - ENV["testing_labels_df_filename"] -training_features_df_filename = - ENV["training_features_df_filename"] -training_labels_df_filename = - ENV["training_labels_df_filename"] -validation_features_df_filename = - ENV["validation_features_df_filename"] -validation_labels_df_filename = - ENV["validation_labels_df_filename"] +trainingandvalidation_features_df_filename = joinpath( + tempdir(), + "trainingandvalidation_features_df.csv", + ) +trainingandvalidation_labels_df_filename = joinpath( + tempdir(), + "trainingandvalidation_labels_df.csv", + ) +testing_features_df_filename = joinpath( + tempdir(), + "testing_features_df.csv", + ) +testing_labels_df_filename = joinpath( + tempdir(), + "testing_labels_df.csv", + ) +training_features_df_filename = joinpath( + tempdir(), + "training_features_df.csv", + ) +training_labels_df_filename = joinpath( + tempdir(), + "training_labels_df.csv", + ) +validation_features_df_filename = joinpath( + tempdir(), + "validation_features_df.csv", + ) +validation_labels_df_filename = joinpath( + tempdir(), + "validation_labels_df.csv", + ) trainingandvalidation_features_df = CSV.read( trainingandvalidation_features_df_filename, DataFrames.DataFrame, @@ -54,10 +70,14 @@ validation_labels_df = CSV.read( DataFrames.DataFrame, ) -smoted_training_features_df_filename = - ENV["smoted_training_features_df_filename"] -smoted_training_labels_df_filename = - ENV["smoted_training_labels_df_filename"] +smoted_training_features_df_filename = joinpath( + tempdir(), + "smoted_training_features_df.csv", + ) +smoted_training_labels_df_filename = joinpath( + tempdir(), + "smoted_training_labels_df.csv", + ) smoted_training_features_df = CSV.read( smoted_training_features_df_filename, DataFrames.DataFrame, @@ -67,6 +87,27 @@ smoted_training_labels_df = CSV.read( DataFrames.DataFrame, ) +logistic_classifier_filename = joinpath( + tempdir(), + "logistic_classifier.jld2", + ) +random_forest_classifier_filename = joinpath( + tempdir(), + 
"random_forest_classifier.jld2", + ) +c_svc_svm_classifier_filename = joinpath( + tempdir(), + "c_svc_svm_classifier.jld2", + ) +nu_svc_svm_classifier_filename = joinpath( + tempdir(), + "nu_svc_svm_classifier.jld2", + ) +knet_mlp_classifier_filename = joinpath( + tempdir(), + "knet_mlp_classifier.jld2", + ) + logistic_classifier = PredictMD.load_model(logistic_classifier_filename) random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) @@ -93,7 +134,6 @@ function knetmlp_predict( return unnormalizedlogprobs end end - function knetmlp_loss( predict::Function, modelweights, # don't put a type annotation on this @@ -119,7 +159,6 @@ function knetmlp_loss( end return loss end - knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) PredictMD.predict_proba(logistic_classifier,smoted_training_features_df,) diff --git a/src/utils/tikzpictures.jl b/src/utils/tikzpictures.jl index a59066d12..daa655bd3 100644 --- a/src/utils/tikzpictures.jl +++ b/src/utils/tikzpictures.jl @@ -26,6 +26,8 @@ end """ """ function save_plot_pdf(filename::AbstractString, tp::TikzPictures.TikzPicture) + parent_directory = Base.Filesystem.dirname(filename) + Base.Filesystem.mkpath(parent_directory) result = TikzPictures.save(TikzPictures.PDF(filename), tp) return result end @@ -33,6 +35,8 @@ end """ """ function save_plot_tex(filename::AbstractString, tp::TikzPictures.TikzPicture) + parent_directory = Base.Filesystem.dirname(filename) + Base.Filesystem.mkpath(parent_directory) result = TikzPictures.save(TikzPictures.TEX(filename), tp) return result end @@ -40,6 +44,8 @@ end """ """ function save_plot_tikz(filename::AbstractString, tp::TikzPictures.TikzPicture) + parent_directory = Base.Filesystem.dirname(filename) + Base.Filesystem.mkpath(parent_directory) result = TikzPictures.save(TikzPictures.TIKZ(filename), tp) return result end @@ -47,6 +53,8 @@ end """ """ function save_plot_svg(filename::AbstractString, tp::TikzPictures.TikzPicture) + parent_directory = Base.Filesystem.dirname(filename) + Base.Filesystem.mkpath(parent_directory) result = TikzPictures.save(TikzPictures.SVG(filename), tp) return result end @@ -62,7 +70,7 @@ end """ """ function open_plot(filename::AbstractString, tp::TikzPictures.TikzPicture) - saveresult = save_plot_svg(filename, tp) + saveresult = save_plot(filename, tp) openresult = open_browser_window(filename) return openresult end From 939f12c1d4546cc1747e057c88c9c8bed3d931ba Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 01:59:03 -0400 Subject: [PATCH 46/62] progress commit --- .../01_preprocess_data.jl | 15 ++++++++++ examples/breast_cancer_biopsy/02_smote.jl | 17 +++++++++++ .../03_logistic_classifier.jl | 27 +++++++++++++++--- .../04_random_forest_classifier.jl | 28 +++++++++++++++---- .../05_c_svc_svm_classifier.jl | 27 +++++++++++++++--- .../06_nu_svc_svm_classifier.jl | 27 +++++++++++++++--- .../07_knet_mlp_classifier.jl | 28 +++++++++++++++---- .../breast_cancer_biopsy/08_compare_models.jl | 22 +++++++++++++++ .../09_get_model_output.jl | 8 ++++++ 9 files changed, 177 insertions(+), 22 deletions(-) diff --git a/examples/breast_cancer_biopsy/01_preprocess_data.jl b/examples/breast_cancer_biopsy/01_preprocess_data.jl index f025d6f04..39a0c99ee 100644 --- a/examples/breast_cancer_biopsy/01_preprocess_data.jl +++ b/examples/breast_cancer_biopsy/01_preprocess_data.jl @@ -53,36 +53,51 @@ training_features_df, 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 
25% validation ) +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) CSV.write( diff --git a/examples/breast_cancer_biopsy/02_smote.jl b/examples/breast_cancer_biopsy/02_smote.jl index c1a3a42b5..ca68938f4 100644 --- a/examples/breast_cancer_biopsy/02_smote.jl +++ b/examples/breast_cancer_biopsy/02_smote.jl @@ -5,36 +5,51 @@ import DataFrames import PredictMD import StatsBase +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -113,10 +128,12 @@ StatsBase.countmap(smoted_training_labels_df[singlelabelname]) smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) CSV.write( diff --git a/examples/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/breast_cancer_biopsy/03_logistic_classifier.jl index 4f1a893c3..8cc7d97a4 100644 --- a/examples/breast_cancer_biopsy/03_logistic_classifier.jl +++ b/examples/breast_cancer_biopsy/03_logistic_classifier.jl @@ -4,36 +4,51 @@ import CSV import DataFrames import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) 
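Grouping every artifact under one per-example directory keeps tempdir() tidy, but that directory has to exist before the first CSV.write call: tempdir() itself always exists, the subdirectory does not, which is why each script in this patch now begins with mkpath. A self-contained sketch of the pattern these hunks introduce, using a stand-in data frame rather than the real data:

    import CSV
    import DataFrames

    output_directory = joinpath(tempdir(), "breast_cancer_biopsy_example")
    mkpath(output_directory)  # creates the directory; no-op if it already exists

    example_df = DataFrames.DataFrame(a = [1, 2, 3])  # stand-in for the real data
    example_df_filename = joinpath(output_directory, "example_df.csv")
    CSV.write(example_df_filename, example_df)

mkpath (unlike mkdir) also creates any missing intermediate directories and does not error when the path already exists, so it is safe to call unconditionally at the top of every script.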
testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -71,10 +86,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -104,10 +121,6 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -logistic_classifier_filename = joinpath( - tempdir(), - "logistic_classifier.jld2", - ) feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, @@ -220,4 +233,10 @@ println( ) showall(logistic_risk_group_prevalences) +logistic_classifier_filename = joinpath( + tempdir(), + "breast_cancer_biopsy_example", + "logistic_classifier.jld2", + ) + PredictMD.save_model(logistic_classifier_filename, logistic_classifier) diff --git a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl index dc7abfccb..9b9042563 100644 --- a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -5,36 +5,51 @@ import CSV import DataFrames import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,10 +87,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -106,11 +123,6 @@ negativeclass = "benign" 
positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -random_forest_classifier_filename = joinpath( - tempdir(), - "random_forest_classifier.jld2", - ) - feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, featurenames, @@ -169,4 +181,10 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) +random_forest_classifier_filename = joinpath( + tempdir(), + "breast_cancer_biopsy_example", + "random_forest_classifier.jld2", + ) + PredictMD.save_model(random_forest_classifier_filename, random_forest_classifier) diff --git a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl index e18a78af6..b7f959a30 100644 --- a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl @@ -5,36 +5,51 @@ import DataFrames import LIBSVM import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,10 +87,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -105,10 +122,6 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -c_svc_svm_classifier_filename = joinpath( - tempdir(), - "c_svc_svm_classifier.jld2", - ) feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, @@ -168,4 +181,10 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) +c_svc_svm_classifier_filename = joinpath( + tempdir(), + "breast_cancer_biopsy_example", + "c_svc_svm_classifier.jld2", + ) + PredictMD.save_model(c_svc_svm_classifier_filename, c_svc_svm_classifier) diff --git a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl index f9b2c5882..04dbef98b 100644 --- a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl @@ -5,36 +5,51 @@ import DataFrames import LIBSVM import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + 
"breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -76,10 +91,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -109,10 +126,6 @@ singlelabelname = :Class negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -nu_svc_svm_classifier_filename = joinpath( - tempdir(), - "nu_svc_svm_classifier.jld2", - ) feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, @@ -172,4 +185,10 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) +nu_svc_svm_classifier_filename = joinpath( + tempdir(), + "breast_cancer_biopsy_example", + "nu_svc_svm_classifier.jld2", + ) + PredictMD.save_model(nu_svc_svm_classifier_filename, nu_svc_svm_classifier) diff --git a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl index 730f5ab15..4106dfa8c 100644 --- a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -5,36 +5,51 @@ import DataFrames import Knet import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,10 +87,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", 
"smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -106,11 +123,6 @@ negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] -knet_mlp_classifier_filename = joinpath( - tempdir(), - "knet_mlp_classifier.jld2", - ) - function knetmlp_predict( w, # don't put a type annotation on this x0::AbstractArray; @@ -295,4 +307,10 @@ PredictMD.singlelabelbinaryclassificationmetrics( sensitivity = 0.95, ) +knet_mlp_classifier_filename = joinpath( + tempdir(), + "breast_cancer_biopsy_example", + "knet_mlp_classifier.jld2", + ) + PredictMD.save_model(knet_mlp_classifier_filename, knet_mlp_classifier) diff --git a/examples/breast_cancer_biopsy/08_compare_models.jl b/examples/breast_cancer_biopsy/08_compare_models.jl index 1dd52a489..a86f0ddee 100644 --- a/examples/breast_cancer_biopsy/08_compare_models.jl +++ b/examples/breast_cancer_biopsy/08_compare_models.jl @@ -5,36 +5,51 @@ import DataFrames import Knet import PredictMD +mkpath( + joinpath( + tempdir(), + "breast_cancer_biopsy_example", + ), + ) + trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( @@ -72,10 +87,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -89,22 +106,27 @@ smoted_training_labels_df = CSV.read( logistic_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "logistic_classifier.jld2", ) random_forest_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "random_forest_classifier.jld2", ) c_svc_svm_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "c_svc_svm_classifier.jld2", ) nu_svc_svm_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "nu_svc_svm_classifier.jld2", ) knet_mlp_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "knet_mlp_classifier.jld2", ) diff --git a/examples/breast_cancer_biopsy/09_get_model_output.jl b/examples/breast_cancer_biopsy/09_get_model_output.jl index 455ef477e..57d6de241 100644 --- a/examples/breast_cancer_biopsy/09_get_model_output.jl +++ b/examples/breast_cancer_biopsy/09_get_model_output.jl @@ -7,34 +7,42 @@ import PredictMD 
trainingandvalidation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_features_df.csv", ) trainingandvalidation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "trainingandvalidation_labels_df.csv", ) testing_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_features_df.csv", ) testing_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "testing_labels_df.csv", ) training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_features_df.csv", ) training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "training_labels_df.csv", ) validation_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_features_df.csv", ) validation_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "validation_labels_df.csv", ) trainingandvalidation_features_df = CSV.read( From e85b5d5d269d7d4d9928e515efb02835f99434e1 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 02:50:16 -0400 Subject: [PATCH 47/62] Fix bugs --- REQUIRE | 1 + .../boston_housing/02_linear_regression.jl | 4 +-- .../03_random_forest_regression.jl | 4 +-- .../boston_housing/04_knet_mlp_regression.jl | 8 ++--- examples/boston_housing/05_compare_models.jl | 4 +-- .../boston_housing/06_get_model_output.jl | 4 +-- examples/breast_cancer_biopsy/02_smote.jl | 4 +-- .../03_logistic_classifier.jl | 4 +-- .../04_random_forest_classifier.jl | 4 +-- .../05_c_svc_svm_classifier.jl | 4 +-- .../06_nu_svc_svm_classifier.jl | 4 +-- .../07_knet_mlp_classifier.jl | 14 ++++----- .../breast_cancer_biopsy/08_compare_models.jl | 4 +-- .../09_get_model_output.jl | 4 +-- src/PredictMD.jl | 13 ++++++++ src/deprecations/deprecated.jl | 7 +++++ src/integrations/docs/documenter.jl | 1 + src/integrations/docs/literate.jl | 1 + .../ordinary_least_squares_regression.jl | 30 ++++++++++++------- src/utils/trapz.jl | 8 ++--- 20 files changed, 80 insertions(+), 47 deletions(-) create mode 100644 src/deprecations/deprecated.jl create mode 100644 src/integrations/docs/documenter.jl create mode 100644 src/integrations/docs/literate.jl diff --git a/REQUIRE b/REQUIRE index 4a5aedbaf..6072b6daa 100644 --- a/REQUIRE +++ b/REQUIRE @@ -6,6 +6,7 @@ BSON 0.1 CSV 0.2 ClassImbalance 0.2 Combinatorics 0.6 +Compat Crayons 0.4 DataFrames 0.11 DecisionTree 0.6 diff --git a/examples/boston_housing/02_linear_regression.jl b/examples/boston_housing/02_linear_regression.jl index 6edee7a5c..ece66d336 100644 --- a/examples/boston_housing/02_linear_regression.jl +++ b/examples/boston_housing/02_linear_regression.jl @@ -71,8 +71,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/boston_housing/03_random_forest_regression.jl b/examples/boston_housing/03_random_forest_regression.jl index 554eaea4b..3c32b9266 100644 --- a/examples/boston_housing/03_random_forest_regression.jl +++ b/examples/boston_housing/03_random_forest_regression.jl @@ -71,8 +71,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, 
DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/boston_housing/04_knet_mlp_regression.jl b/examples/boston_housing/04_knet_mlp_regression.jl index 79fe41f3e..ca0cb7c5e 100644 --- a/examples/boston_housing/04_knet_mlp_regression.jl +++ b/examples/boston_housing/04_knet_mlp_regression.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( @@ -142,6 +142,8 @@ function knetmlp_loss( return loss end +feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) + knetmlp_modelweights = Any[ # input layer has dimension contrasts.num_array_columns # @@ -170,8 +172,6 @@ knetmlp_optimizerhyperparameters = Dict() knetmlp_minibatchsize = 48 knetmlp_maxepochs = 1_000 -feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) - knet_mlp_regression = PredictMD.singlelabeldataframeknetregression( featurenames, singlelabelname; diff --git a/examples/boston_housing/05_compare_models.jl b/examples/boston_housing/05_compare_models.jl index 3e49332f8..1b19dd6b2 100644 --- a/examples/boston_housing/05_compare_models.jl +++ b/examples/boston_housing/05_compare_models.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/boston_housing/06_get_model_output.jl b/examples/boston_housing/06_get_model_output.jl index da807a0c2..16cdd6593 100644 --- a/examples/boston_housing/06_get_model_output.jl +++ b/examples/boston_housing/06_get_model_output.jl @@ -65,8 +65,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/02_smote.jl b/examples/breast_cancer_biopsy/02_smote.jl index ca68938f4..22079dab7 100644 --- a/examples/breast_cancer_biopsy/02_smote.jl +++ b/examples/breast_cancer_biopsy/02_smote.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/breast_cancer_biopsy/03_logistic_classifier.jl index 8cc7d97a4..acf98a47e 100644 --- a/examples/breast_cancer_biopsy/03_logistic_classifier.jl +++ b/examples/breast_cancer_biopsy/03_logistic_classifier.jl @@ -71,8 +71,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl index 9b9042563..6055e41ed 100644 --- 
a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl index b7f959a30..157619a84 100644 --- a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl index 04dbef98b..3214123af 100644 --- a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl +++ b/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl @@ -76,8 +76,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl index 4106dfa8c..dc6055548 100644 --- a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( @@ -171,6 +171,11 @@ function knetmlp_loss( return loss end +feature_contrasts = PredictMD.generate_feature_contrasts( + smoted_training_features_df, + featurenames, + ) + knetmlp_modelweights = Any[ # input layer has dimension contrasts.num_array_columns # @@ -208,11 +213,6 @@ knetmlp_optimizerhyperparameters = Dict() knetmlp_minibatchsize = 48 knetmlp_maxepochs = 1_000 -feature_contrasts = PredictMD.generate_feature_contrasts( - smoted_training_features_df, - featurenames, - ) - knet_mlp_classifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( featurenames, singlelabelname, diff --git a/examples/breast_cancer_biopsy/08_compare_models.jl b/examples/breast_cancer_biopsy/08_compare_models.jl index a86f0ddee..b85e581ef 100644 --- a/examples/breast_cancer_biopsy/08_compare_models.jl +++ b/examples/breast_cancer_biopsy/08_compare_models.jl @@ -72,8 +72,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/examples/breast_cancer_biopsy/09_get_model_output.jl b/examples/breast_cancer_biopsy/09_get_model_output.jl index 57d6de241..32807631f 100644 --- a/examples/breast_cancer_biopsy/09_get_model_output.jl +++ 
b/examples/breast_cancer_biopsy/09_get_model_output.jl @@ -65,8 +65,8 @@ training_features_df = CSV.read( training_features_df_filename, DataFrames.DataFrame, ) -training_features_df = CSV.read( - training_features_df_filename, +training_labels_df = CSV.read( + training_labels_df_filename, DataFrames.DataFrame, ) validation_features_df = CSV.read( diff --git a/src/PredictMD.jl b/src/PredictMD.jl index ce93e9c75..682b1d302 100644 --- a/src/PredictMD.jl +++ b/src/PredictMD.jl @@ -1,12 +1,19 @@ __precompile__(true) +""" +""" module PredictMD # base/ +# (base must go first) include("base/interface.jl") include("base/types.jl") include("base/version.jl") +# deprecations/ +# (deprecations must go second) +include("deprecations/deprecated.jl") + # calibration/ # classimbalance/ @@ -24,9 +31,15 @@ include("datasets/rdatasets.jl") # ensemble/ # gpu/ +""" +""" +module GPU include("gpu/cudnn.jl") +end # integrations/ +include("integrations/docs/documenter.jl") +include("integrations/docs/literate.jl") include("integrations/ide/atom.jl") # io/ diff --git a/src/deprecations/deprecated.jl b/src/deprecations/deprecated.jl new file mode 100644 index 000000000..83f2836de --- /dev/null +++ b/src/deprecations/deprecated.jl @@ -0,0 +1,7 @@ +# Here's an example of what a deprecation looks like: +# function Formula(lhs, rhs) +# Base.depwarn("Formula(lhs, rhs) is deprecated. Use @eval(@formula(\$lhs ~ \$rhs)) if " * +# "parsing is required, or Formula(ex_orig, ex, lhs, rhs) if not", +# :Formula) +# Formula(:(), :($lhs ~ $rhs), lhs, rhs) +# end diff --git a/src/integrations/docs/documenter.jl b/src/integrations/docs/documenter.jl new file mode 100644 index 000000000..e1b372c5d --- /dev/null +++ b/src/integrations/docs/documenter.jl @@ -0,0 +1 @@ +import Documenter diff --git a/src/integrations/docs/literate.jl b/src/integrations/docs/literate.jl new file mode 100644 index 000000000..1c756c6a7 --- /dev/null +++ b/src/integrations/docs/literate.jl @@ -0,0 +1 @@ +import Literate diff --git a/src/linearmodel/ordinary_least_squares_regression.jl b/src/linearmodel/ordinary_least_squares_regression.jl index 7681ffb3f..e9cb2aa57 100644 --- a/src/linearmodel/ordinary_least_squares_regression.jl +++ b/src/linearmodel/ordinary_least_squares_regression.jl @@ -3,25 +3,35 @@ import GLM import StatsModels """ + ordinary_least_squares_regression(x, y; intercept = true) + +Find the best fit line to the set of 2-dimensional points (x, y) using the +ordinary least squares method. 
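A hypothetical call, with invented data lying near y = 2x (see the two paragraphs below for the intercept keyword):

    x = [1.0, 2.0, 3.0, 4.0, 5.0]
    y = [2.1, 3.9, 6.2, 7.8, 10.1]
    a, b = PredictMD.ordinary_least_squares_regression(x, y; intercept = true)
    # for this data, a should be close to 0 and b close to 2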
+ +If intercept is true (default), fit a line of the form y = a + b*x (where a +and b are real numbers) and return the tuple (a, b) + +If intercept is false, fit a line of the form y = b*x (where b is a real +number) and return the tuple (0, b) """ function ordinary_least_squares_regression( - X::AbstractVector{T}, - Y::AbstractVector{T}; + x::AbstractVector{T}, + y::AbstractVector{T}; intercept::Bool = true, ) where T <: Real - if length(X) != length(Y) - error("length(X) != length(Y)") + if length(x) != length(y) + error("length(x) != length(y)") end - if length(X) == 0 - error("length(X) == 0") + if length(x) == 0 + error("length(x) == 0") end data = DataFrames.DataFrame( - X = X, - Y = Y, + x = x, + y = y, ) if intercept estimated_intercept, estimated_x_coefficient = try - ols_regression = GLM.lm(StatsModels.@formula(Y ~ 1 + X),data,) + ols_regression = GLM.lm(StatsModels.@formula(y ~ 1 + x),data,) coefficients = ols_regression.model.pp.beta0 # estimated intercept: coefficients[1] # estimated x coefficient: coefficients[2] @@ -32,7 +42,7 @@ function ordinary_least_squares_regression( end else estimated_intercept, estimated_x_coefficient = try - ols_regression = GLM.lm(StatsModels.@formula(Y ~ 0 + X),data,) + ols_regression = GLM.lm(StatsModels.@formula(y ~ 0 + x),data,) coefficients = ols_regression.model.pp.beta0 # intercept: 0 # estimated x coefficient: coefficients[1] diff --git a/src/utils/trapz.jl b/src/utils/trapz.jl index fda4f4b0b..535040a54 100644 --- a/src/utils/trapz.jl +++ b/src/utils/trapz.jl @@ -1,9 +1,10 @@ import NumericalIntegration """ - trapz(x,y) + trapz(x, y) -Compute the area under the curve of (x,y) points using the trapezoidal method. +Compute the area under the curve of 2-dimensional points (x, y) using the +trapezoidal method. 
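For intuition (numbers invented): the points (0, 0), (1, 1), (2, 2) lie on the line y = x, and the trapezoidal rule gives (0 + 1)/2 * 1 + (1 + 2)/2 * 1 = 2, matching the exact integral of x from 0 to 2. A minimal call sketch:

    x = [0.0, 1.0, 2.0]
    y = [0.0, 1.0, 2.0]
    area = PredictMD.trapz(x, y)  # expected result: 2.0

Note that the implementation below requires x to be sorted in ascending order and errors otherwise.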
""" function trapz( x::AbstractVector, @@ -15,12 +16,11 @@ function trapz( if length(x) == 0 error("length(x) == 0") end - N = length(x) if !all(x .== sort(x; rev = false)) error("x needs to be sorted in ascending order") end twoI = 0 - for k = 2:N + for k = 2:length(x) twoI += ( y[k] + y[k-1] ) * ( x[k] - x[k-1] ) end I = twoI/2 From 829ceebf7e04996a23b324eeeaaf056fb6bb1d7c Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 04:06:43 -0400 Subject: [PATCH 48/62] Fix some more bugs --- examples/breast_cancer_biopsy/03_logistic_classifier.jl | 2 +- .../breast_cancer_biopsy/04_random_forest_classifier.jl | 1 - examples/breast_cancer_biopsy/09_get_model_output.jl | 7 +++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/breast_cancer_biopsy/03_logistic_classifier.jl index acf98a47e..f0d1198b0 100644 --- a/examples/breast_cancer_biopsy/03_logistic_classifier.jl +++ b/examples/breast_cancer_biopsy/03_logistic_classifier.jl @@ -127,7 +127,7 @@ feature_contrasts = PredictMD.generate_feature_contrasts( featurenames, ) -logistic_classifier = PredictMD.singlelabelbinaryclassdataframelogistic_classifier( +logistic_classifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifier( featurenames, singlelabelname, singlelabellevels; diff --git a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl index 6055e41ed..efa38af9e 100644 --- a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -1,6 +1,5 @@ srand(999) - import CSV import DataFrames import PredictMD diff --git a/examples/breast_cancer_biopsy/09_get_model_output.jl b/examples/breast_cancer_biopsy/09_get_model_output.jl index 32807631f..3bdf45c77 100644 --- a/examples/breast_cancer_biopsy/09_get_model_output.jl +++ b/examples/breast_cancer_biopsy/09_get_model_output.jl @@ -80,10 +80,12 @@ validation_labels_df = CSV.read( smoted_training_features_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_features_df.csv", ) smoted_training_labels_df_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "smoted_training_labels_df.csv", ) smoted_training_features_df = CSV.read( @@ -97,22 +99,27 @@ smoted_training_labels_df = CSV.read( logistic_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "logistic_classifier.jld2", ) random_forest_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "random_forest_classifier.jld2", ) c_svc_svm_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "c_svc_svm_classifier.jld2", ) nu_svc_svm_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "nu_svc_svm_classifier.jld2", ) knet_mlp_classifier_filename = joinpath( tempdir(), + "breast_cancer_biopsy_example", "knet_mlp_classifier.jld2", ) From 854c95a0599485a43201148c1b5e36d7f672cf46 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 12:09:47 -0400 Subject: [PATCH 49/62] Progress commit --- docs/make_docs.jl | 15 ++++++++++++++- .../boston_housing/01_preprocess_data.jl | 0 .../boston_housing/02_linear_regression.jl | 0 .../boston_housing/03_random_forest_regression.jl | 0 .../boston_housing/04_knet_mlp_regression.jl | 0 .../{ => cpu}/boston_housing/05_compare_models.jl | 0 .../boston_housing/06_get_model_output.jl | 0 .../breast_cancer_biopsy/01_preprocess_data.jl 
| 0 .../{ => cpu}/breast_cancer_biopsy/02_smote.jl | 0 .../03_logistic_classifier.jl | 0 .../04_random_forest_classifier.jl | 0 .../05_c_svc_svm_classifier.jl | 0 .../06_nu_svc_svm_classifier.jl | 0 .../07_knet_mlp_classifier.jl | 0 .../breast_cancer_biopsy/08_compare_models.jl | 0 .../breast_cancer_biopsy/09_get_model_output.jl | 0 test/runtests.jl | 10 ++++++++-- 17 files changed, 22 insertions(+), 3 deletions(-) rename examples/{ => cpu}/boston_housing/01_preprocess_data.jl (100%) rename examples/{ => cpu}/boston_housing/02_linear_regression.jl (100%) rename examples/{ => cpu}/boston_housing/03_random_forest_regression.jl (100%) rename examples/{ => cpu}/boston_housing/04_knet_mlp_regression.jl (100%) rename examples/{ => cpu}/boston_housing/05_compare_models.jl (100%) rename examples/{ => cpu}/boston_housing/06_get_model_output.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/01_preprocess_data.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/02_smote.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/03_logistic_classifier.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/04_random_forest_classifier.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/05_c_svc_svm_classifier.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/07_knet_mlp_classifier.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/08_compare_models.jl (100%) rename examples/{ => cpu}/breast_cancer_biopsy/09_get_model_output.jl (100%) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index f4398df9f..d61a15e3b 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -19,6 +19,16 @@ examples_output_parent_directory = joinpath( ) mkpath(examples_output_parent_directory) +cpu_examples_input_parent_directory = joinpath( + examples_input_parent_directory, + "cpu", + ) +cpu_examples_output_parent_directory = joinpath( + examples_output_parent_directory, + "cpu", + ) +mkpath(cpu_examples_output_parent_directory) + boston_housing_input_directory = joinpath( examples_input_parent_directory, "boston_housing", @@ -313,7 +323,10 @@ Literate.script( info("DEBUG: using Documenter.jl to generate Markdown docs") Documenter.makedocs( - modules = [PredictMD], + modules = [ + PredictMD, + PredictMD.GPU, + ], sitename = "PredictMD.jl", pages = Any[ "index.md", diff --git a/examples/boston_housing/01_preprocess_data.jl b/examples/cpu/boston_housing/01_preprocess_data.jl similarity index 100% rename from examples/boston_housing/01_preprocess_data.jl rename to examples/cpu/boston_housing/01_preprocess_data.jl diff --git a/examples/boston_housing/02_linear_regression.jl b/examples/cpu/boston_housing/02_linear_regression.jl similarity index 100% rename from examples/boston_housing/02_linear_regression.jl rename to examples/cpu/boston_housing/02_linear_regression.jl diff --git a/examples/boston_housing/03_random_forest_regression.jl b/examples/cpu/boston_housing/03_random_forest_regression.jl similarity index 100% rename from examples/boston_housing/03_random_forest_regression.jl rename to examples/cpu/boston_housing/03_random_forest_regression.jl diff --git a/examples/boston_housing/04_knet_mlp_regression.jl b/examples/cpu/boston_housing/04_knet_mlp_regression.jl similarity index 100% rename from examples/boston_housing/04_knet_mlp_regression.jl rename to examples/cpu/boston_housing/04_knet_mlp_regression.jl diff --git a/examples/boston_housing/05_compare_models.jl b/examples/cpu/boston_housing/05_compare_models.jl 
similarity index 100% rename from examples/boston_housing/05_compare_models.jl rename to examples/cpu/boston_housing/05_compare_models.jl diff --git a/examples/boston_housing/06_get_model_output.jl b/examples/cpu/boston_housing/06_get_model_output.jl similarity index 100% rename from examples/boston_housing/06_get_model_output.jl rename to examples/cpu/boston_housing/06_get_model_output.jl diff --git a/examples/breast_cancer_biopsy/01_preprocess_data.jl b/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl similarity index 100% rename from examples/breast_cancer_biopsy/01_preprocess_data.jl rename to examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl diff --git a/examples/breast_cancer_biopsy/02_smote.jl b/examples/cpu/breast_cancer_biopsy/02_smote.jl similarity index 100% rename from examples/breast_cancer_biopsy/02_smote.jl rename to examples/cpu/breast_cancer_biopsy/02_smote.jl diff --git a/examples/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/03_logistic_classifier.jl rename to examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl diff --git a/examples/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/04_random_forest_classifier.jl rename to examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl diff --git a/examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl b/examples/cpu/breast_cancer_biopsy/05_c_svc_svm_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/05_c_svc_svm_classifier.jl rename to examples/cpu/breast_cancer_biopsy/05_c_svc_svm_classifier.jl diff --git a/examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl b/examples/cpu/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl rename to examples/cpu/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl diff --git a/examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl similarity index 100% rename from examples/breast_cancer_biopsy/07_knet_mlp_classifier.jl rename to examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl diff --git a/examples/breast_cancer_biopsy/08_compare_models.jl b/examples/cpu/breast_cancer_biopsy/08_compare_models.jl similarity index 100% rename from examples/breast_cancer_biopsy/08_compare_models.jl rename to examples/cpu/breast_cancer_biopsy/08_compare_models.jl diff --git a/examples/breast_cancer_biopsy/09_get_model_output.jl b/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl similarity index 100% rename from examples/breast_cancer_biopsy/09_get_model_output.jl rename to examples/cpu/breast_cancer_biopsy/09_get_model_output.jl diff --git a/test/runtests.jl b/test/runtests.jl index 534c17b64..57ce037fe 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,13 +26,19 @@ Base.Test.@testset "PredictMD test suite" begin include("cpu/unit/utils/test_fix_vector_type.jl") end end + Base.Test.@testset "Generate documentation and examples" begin + info("INFO generating documentation and examples") + include("../docs/make_docs.jl") + end Base.Test.@testset "Test examples (CPU)" begin info("INFO testing examples (CPU)") Base.Test.@testset "Boston housing regression" begin - # include("") + include( + 
"../docs/src/examples/cpu/boston_housing/01_preprocess_data.jl" + ) end Base.Test.@testset "Breast cancer biopsy classification" begin - # include("") + include("") end end end From 711d31241bb137a31b290fe70b0086bc96de4671 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 12:10:58 -0400 Subject: [PATCH 50/62] Fix typo --- docs/make_docs.jl | 2 +- test/runtests.jl | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index d61a15e3b..4cc6c345f 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -153,7 +153,7 @@ breast_cancer_biopsy_input_directory = joinpath( ) breast_cancer_biopsy_output_directory = joinpath( examples_output_parent_directory, - "breast_cancer_biopsyg", + "breast_cancer_biopsy", ) mkpath(breast_cancer_biopsy_output_directory) diff --git a/test/runtests.jl b/test/runtests.jl index 57ce037fe..27e9ff390 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,12 +33,23 @@ Base.Test.@testset "PredictMD test suite" begin Base.Test.@testset "Test examples (CPU)" begin info("INFO testing examples (CPU)") Base.Test.@testset "Boston housing regression" begin - include( - "../docs/src/examples/cpu/boston_housing/01_preprocess_data.jl" - ) + include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/") end Base.Test.@testset "Breast cancer biopsy classification" begin - include("") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/") end end end From fc1bf1b67276d82fbb7a7124a07f8524710b218e Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 12:17:01 -0400 Subject: [PATCH 51/62] Fixes --- docs/make_docs.jl | 30 +++++++++---------- .../boston_housing/04_knet_mlp_regression.jl | 8 ----- .../07_knet_mlp_classifier.jl | 11 ------- test/runtests.jl | 30 +++++++++---------- 4 files changed, 30 insertions(+), 49 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index 4cc6c345f..dbcd94693 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -48,7 +48,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(boston_housing_input_directory, "01_preprocess_data.jl"), @@ -66,7 +66,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "02_linear_regression.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(boston_housing_input_directory, "02_linear_regression.jl"), @@ -84,7 +84,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "03_random_forest_regression.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( 
joinpath(boston_housing_input_directory, "03_random_forest_regression.jl"), @@ -102,7 +102,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "04_knet_mlp_regression.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(boston_housing_input_directory, "04_knet_mlp_regression.jl"), @@ -120,7 +120,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "05_compare_models.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(boston_housing_input_directory, "05_compare_models.jl"), @@ -138,7 +138,7 @@ Literate.notebook( joinpath(boston_housing_input_directory, "06_get_model_output.jl"), boston_housing_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(boston_housing_input_directory, "06_get_model_output.jl"), @@ -166,7 +166,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "01_preprocess_data.jl"), @@ -184,7 +184,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "02_smote.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "02_smote.jl"), @@ -202,7 +202,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "03_logistic_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "03_logistic_classifier.jl"), @@ -220,7 +220,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "04_random_forest_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "04_random_forest_classifier.jl"), @@ -238,7 +238,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "05_c_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "05_c_svc_svm_classifier.jl"), @@ -256,7 +256,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "06_nu_svc_svm_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "06_nu_svc_svm_classifier.jl"), @@ -274,7 +274,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "07_knet_mlp_classifier.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "07_knet_mlp_classifier.jl"), @@ -292,7 +292,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "08_compare_models.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "08_compare_models.jl"), @@ -310,7 +310,7 @@ Literate.notebook( joinpath(breast_cancer_biopsy_input_directory, "09_get_model_output.jl"), breast_cancer_biopsy_output_directory; documenter = true, - execute = false, + execute = 
true, ) Literate.script( joinpath(breast_cancer_biopsy_input_directory, "09_get_model_output.jl"), diff --git a/examples/cpu/boston_housing/04_knet_mlp_regression.jl b/examples/cpu/boston_housing/04_knet_mlp_regression.jl index ca0cb7c5e..f730a0313 100644 --- a/examples/cpu/boston_housing/04_knet_mlp_regression.jl +++ b/examples/cpu/boston_housing/04_knet_mlp_regression.jl @@ -110,10 +110,7 @@ function knetmlp_predict( w, # don't put a type annotation on this x0::AbstractArray, ) - # x0 = input layer - # x1 = hidden layer x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases return x2 end @@ -145,17 +142,12 @@ end feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # hidden layer (10 neurons): Cfloat.( 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights ), Cfloat.( zeros(Cfloat,10,1) # biases ), - # - # output layer (regression nets have exactly 1 neuron in output layer): Cfloat.( 0.1f0*randn(Cfloat,1,10) # weights ), diff --git a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl index dc6055548..d936e557c 100644 --- a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -128,12 +128,8 @@ function knetmlp_predict( x0::AbstractArray; probabilities::Bool = true, ) - # x0 = input layer - # x1 = first hidden layer x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases unnormalizedlogprobs = x3 if probabilities @@ -177,25 +173,18 @@ feature_contrasts = PredictMD.generate_feature_contrasts( ) knetmlp_modelweights = Any[ - # input layer has dimension contrasts.num_array_columns - # - # first hidden layer (64 neurons): Cfloat.( 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights ), Cfloat.( zeros(Cfloat,64,1) # biases ), - # - # second hidden layer (32 neurons): Cfloat.( 0.1f0*randn(Cfloat,32,64) # weights ), Cfloat.( zeros(Cfloat,32,1) # biases ), - # - # output layer (number of neurons == number of classes): Cfloat.( 0.1f0*randn(Cfloat,2,32) # weights ), diff --git a/test/runtests.jl b/test/runtests.jl index 27e9ff390..f82720913 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,23 +33,23 @@ Base.Test.@testset "PredictMD test suite" begin Base.Test.@testset "Test examples (CPU)" begin info("INFO testing examples (CPU)") Base.Test.@testset "Boston housing regression" begin - include("../docs/src/examples/cpu/boston_housing/") - include("../docs/src/examples/cpu/boston_housing/") - include("../docs/src/examples/cpu/boston_housing/") - include("../docs/src/examples/cpu/boston_housing/") - include("../docs/src/examples/cpu/boston_housing/") - include("../docs/src/examples/cpu/boston_housing/") + include("../docs/src/examples/cpu/boston_housing/01_preprocess_data.jl") + include("../docs/src/examples/cpu/boston_housing/02_linear_regression.jl") + include("../docs/src/examples/cpu/boston_housing/03_random_forest_regression.jl") + include("../docs/src/examples/cpu/boston_housing/04_knet_mlp_regression.jl") + include("../docs/src/examples/cpu/boston_housing/05_compare_models.jl") + 
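The execute = true switches above mean Literate.notebook now runs each example while generating its notebook, so a broken example fails the docs build instead of shipping silently. Condensed, the generation pattern that make_docs.jl repeats for every example file looks like this (paths illustrative, not taken verbatim from the patch):

    import Literate

    input_file = joinpath("examples", "cpu", "boston_housing", "01_preprocess_data.jl")
    output_directory = joinpath("docs", "src", "examples", "cpu", "boston_housing")
    mkpath(output_directory)

    # one Literate.jl source produces all three artifacts:
    Literate.markdown(input_file, output_directory; documenter = true)   # Documenter-ready page
    Literate.notebook(input_file, output_directory; execute = true)      # executed .ipynb
    Literate.script(input_file, output_directory)                        # plain .jl script

The test suite then simply include()s the generated scripts, as the runtests.jl hunks here and below show.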
include("../docs/src/examples/cpu/boston_housing/06_get_model_output.jl") end Base.Test.@testset "Breast cancer biopsy classification" begin - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") - include("../docs/src/examples/cpu/breast_cancer_biopsy/") + include("../docs/src/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/02_smote.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/05_c_svc_svm_classifier.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/06_nu_svc_svm_classifier.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/08_compare_models.jl") + include("../docs/src/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl") end end end From 458fa38aee38facf3bd427ef3d4446d177f01b3c Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 12:44:46 -0400 Subject: [PATCH 52/62] Progress commit --- docs/make_docs.jl | 11 +++----- .../cpu/boston_housing/01_preprocess_data.jl | 4 +-- .../boston_housing/02_linear_regression.jl | 6 ++--- .../03_random_forest_regression.jl | 6 ++--- .../boston_housing/04_knet_mlp_regression.jl | 18 ++++++------- .../cpu/boston_housing/05_compare_models.jl | 11 +++----- .../cpu/boston_housing/06_get_model_output.jl | 11 +++----- .../01_preprocess_data.jl | 4 +-- examples/cpu/breast_cancer_biopsy/02_smote.jl | 4 +-- .../03_logistic_classifier.jl | 6 ++--- .../04_random_forest_classifier.jl | 6 ++--- .../07_knet_mlp_classifier.jl | 26 +++++++++---------- .../breast_cancer_biopsy/08_compare_models.jl | 16 +++++------- .../09_get_model_output.jl | 16 +++++------- 14 files changed, 64 insertions(+), 81 deletions(-) diff --git a/docs/make_docs.jl b/docs/make_docs.jl index dbcd94693..b73d991d6 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -1,8 +1,5 @@ import Documenter import Literate - -info("DEBUG: importing PredictMD") - import PredictMD info("DEBUG: using Literate.jl to generate examples") @@ -30,11 +27,11 @@ cpu_examples_output_parent_directory = joinpath( mkpath(cpu_examples_output_parent_directory) boston_housing_input_directory = joinpath( - examples_input_parent_directory, + cpu_examples_input_parent_directory, "boston_housing", ) boston_housing_output_directory = joinpath( - examples_output_parent_directory, + cpu_examples_output_parent_directory, "boston_housing", ) mkpath(boston_housing_output_directory) @@ -148,11 +145,11 @@ Literate.script( ) breast_cancer_biopsy_input_directory = joinpath( - examples_input_parent_directory, + cpu_examples_input_parent_directory, "breast_cancer_biopsy", ) breast_cancer_biopsy_output_directory = joinpath( - examples_output_parent_directory, + cpu_examples_output_parent_directory, "breast_cancer_biopsy", ) mkpath(breast_cancer_biopsy_output_directory) diff --git a/examples/cpu/boston_housing/01_preprocess_data.jl 
b/examples/cpu/boston_housing/01_preprocess_data.jl index 25477b9d9..4e198bb8d 100644 --- a/examples/cpu/boston_housing/01_preprocess_data.jl +++ b/examples/cpu/boston_housing/01_preprocess_data.jl @@ -54,7 +54,7 @@ trainingandvalidation_features_df, testing_labels_df = PredictMD.split_data( features_df, labels_df, - 0.75, # 75% training/validation, 25% testing + 0.75, ) training_features_df, training_labels_df, @@ -62,7 +62,7 @@ training_features_df, validation_labels_df = PredictMD.split_data( trainingandvalidation_features_df, trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation + 2/3, ) mkpath( diff --git a/examples/cpu/boston_housing/02_linear_regression.jl b/examples/cpu/boston_housing/02_linear_regression.jl index ece66d336..2a8ddd5c8 100644 --- a/examples/cpu/boston_housing/02_linear_regression.jl +++ b/examples/cpu/boston_housing/02_linear_regression.jl @@ -109,9 +109,9 @@ linear_regression = PredictMD.singlelabeldataframelinearregression( featurenames, singlelabelname; package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 2, # optional, defaults to 1 - name = "Linear regression", # optional + intercept = true, + interactions = 2, + name = "Linear regression", ) PredictMD.fit!(linear_regression,training_features_df,training_labels_df,) diff --git a/examples/cpu/boston_housing/03_random_forest_regression.jl b/examples/cpu/boston_housing/03_random_forest_regression.jl index 3c32b9266..9462e8c71 100644 --- a/examples/cpu/boston_housing/03_random_forest_regression.jl +++ b/examples/cpu/boston_housing/03_random_forest_regression.jl @@ -110,10 +110,10 @@ feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, f random_forest_regression = PredictMD.singlelabeldataframerandomforestregression( featurenames, singlelabelname; - nsubfeatures = 2, # number of subfeatures; defaults to 2 - ntrees = 20, # number of trees; defaults to 10 + nsubfeatures = 2, + ntrees = 20, package = :DecisionTreejl, - name = "Random forest", # optional + name = "Random forest", feature_contrasts = feature_contrasts, ) diff --git a/examples/cpu/boston_housing/04_knet_mlp_regression.jl b/examples/cpu/boston_housing/04_knet_mlp_regression.jl index f730a0313..16b532573 100644 --- a/examples/cpu/boston_housing/04_knet_mlp_regression.jl +++ b/examples/cpu/boston_housing/04_knet_mlp_regression.jl @@ -107,17 +107,17 @@ singlelabelname = :MedV labelnames = [singlelabelname] function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray, ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = w[3]*x1 .+ w[4] return x2 end function knetmlp_loss( predict_function::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -143,16 +143,16 @@ feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, f knetmlp_modelweights = Any[ Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) # weights + 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) ), Cfloat.( - zeros(Cfloat,10,1) # biases + zeros(Cfloat,10,1) ), Cfloat.( - 0.1f0*randn(Cfloat,1,10) # weights + 0.1f0*randn(Cfloat,1,10) ), Cfloat.( - zeros(Cfloat,1,1) # biases + zeros(Cfloat,1,1), ), ] @@ -177,7 +177,7 @@ knet_mlp_regression = PredictMD.singlelabeldataframeknetregression( minibatchsize = 
knetmlp_minibatchsize, modelweights = knetmlp_modelweights, maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, # if 0, will not print at all + printlosseverynepochs = 100, feature_contrasts = feature_contrasts, ) diff --git a/examples/cpu/boston_housing/05_compare_models.jl b/examples/cpu/boston_housing/05_compare_models.jl index 1b19dd6b2..f584a8d3c 100644 --- a/examples/cpu/boston_housing/05_compare_models.jl +++ b/examples/cpu/boston_housing/05_compare_models.jl @@ -105,19 +105,16 @@ linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray, ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = w[3]*x1 .+ w[4] return x2 end function knetmlp_loss( predict_function::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), diff --git a/examples/cpu/boston_housing/06_get_model_output.jl b/examples/cpu/boston_housing/06_get_model_output.jl index 16cdd6593..880bdf167 100644 --- a/examples/cpu/boston_housing/06_get_model_output.jl +++ b/examples/cpu/boston_housing/06_get_model_output.jl @@ -98,19 +98,16 @@ linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray, ) - # x0 = input layer - # x1 = hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = output layer - x2 = w[3]*x1 .+ w[4] # w[3] = weights, w[4] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = w[3]*x1 .+ w[4] return x2 end function knetmlp_loss( predict_function::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), diff --git a/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl b/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl index 39a0c99ee..257b481c1 100644 --- a/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl +++ b/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl @@ -42,7 +42,7 @@ trainingandvalidation_features_df, testing_labels_df = PredictMD.split_data( features_df, labels_df, - 0.75, # 75% training+validation, 25% testing + 0.75, ) training_features_df, training_labels_df, @@ -50,7 +50,7 @@ training_features_df, validation_labels_df = PredictMD.split_data( trainingandvalidation_features_df, trainingandvalidation_labels_df, - 2/3, # 2/3 of 75% = 50% training, 1/3 of 75% = 25% validation + 2/3, ) mkpath( diff --git a/examples/cpu/breast_cancer_biopsy/02_smote.jl b/examples/cpu/breast_cancer_biopsy/02_smote.jl index 22079dab7..c54073775 100644 --- a/examples/cpu/breast_cancer_biopsy/02_smote.jl +++ b/examples/cpu/breast_cancer_biopsy/02_smote.jl @@ -117,8 +117,8 @@ smoted_training_features_df, smoted_training_labels_df = PredictMD.smote( singlelabelname; majorityclass = majorityclass, minorityclass = minorityclass, - pct_over = 100, # how much to oversample the minority class - minority_to_majority_ratio = 1.0, # desired minority:majority ratio + pct_over = 100, + minority_to_majority_ratio = 1.0, k = 5, ) diff --git 
a/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl b/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl index f0d1198b0..a0f62d036 100644 --- a/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl @@ -132,9 +132,9 @@ logistic_classifier = PredictMD.singlelabelbinaryclassdataframelogisticclassifie singlelabelname, singlelabellevels; package = :GLMjl, - intercept = true, # optional, defaults to true - interactions = 1, # optional, defaults to 1 - name = "Logistic regression", # optional + intercept = true, + interactions = 1, + name = "Logistic regression", ) PredictMD.fit!( diff --git a/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl index efa38af9e..75992c72d 100644 --- a/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -131,10 +131,10 @@ random_forest_classifier = PredictMD.singlelabelmulticlassdataframerandomforestc featurenames, singlelabelname, singlelabellevels; - nsubfeatures = 4, # number of subfeatures; defaults to 2 - ntrees = 200, # number of trees; defaults to 10 + nsubfeatures = 4, + ntrees = 200, package = :DecisionTreejl, - name = "Random forest", # optional + name = "Random forest", feature_contrasts = feature_contrasts, ) diff --git a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl index d936e557c..ccc6a67da 100644 --- a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -124,13 +124,13 @@ positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray; probabilities::Bool = true, ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) + x3 = w[5]*x2 .+ w[6] unnormalizedlogprobs = x3 if probabilities normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) @@ -143,7 +143,7 @@ end function knetmlp_loss( predict::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -156,7 +156,7 @@ function knetmlp_loss( probabilities = false, ), ytrue, - 1, # d = 1 means that instances are in columns + 1, ) if L1 != 0 loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) @@ -174,22 +174,22 @@ feature_contrasts = PredictMD.generate_feature_contrasts( knetmlp_modelweights = Any[ Cfloat.( - 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) # weights + 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) ), Cfloat.( - zeros(Cfloat,64,1) # biases + zeros(Cfloat,64,1) ), Cfloat.( - 0.1f0*randn(Cfloat,32,64) # weights + 0.1f0*randn(Cfloat,32,64) ), Cfloat.( - zeros(Cfloat,32,1) # biases + zeros(Cfloat,32,1) ), Cfloat.( - 0.1f0*randn(Cfloat,2,32) # weights + 0.1f0*randn(Cfloat,2,32) ), Cfloat.( - zeros(Cfloat,2,1) # biases + zeros(Cfloat,2,1) ), ] @@ -215,7 +215,7 @@ knet_mlp_classifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( optimizerhyperparameters = knetmlp_optimizerhyperparameters, minibatchsize = knetmlp_minibatchsize, modelweights = 
knetmlp_modelweights, - printlosseverynepochs = 100, # if 0, will not print at all + printlosseverynepochs = 100, maxepochs = knetmlp_maxepochs, feature_contrasts = feature_contrasts, ) diff --git a/examples/cpu/breast_cancer_biopsy/08_compare_models.jl b/examples/cpu/breast_cancer_biopsy/08_compare_models.jl index b85e581ef..503040491 100644 --- a/examples/cpu/breast_cancer_biopsy/08_compare_models.jl +++ b/examples/cpu/breast_cancer_biopsy/08_compare_models.jl @@ -136,17 +136,13 @@ c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray; probabilities::Bool = true, ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) + x3 = w[5]*x2 .+ w[6] unnormalizedlogprobs = x3 if probabilities normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) @@ -158,7 +154,7 @@ function knetmlp_predict( end function knetmlp_loss( predict::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -171,7 +167,7 @@ function knetmlp_loss( probabilities = false, ), ytrue, - 1, # d = 1 means that instances are in columns + 1, ) if L1 != 0 loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) diff --git a/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl b/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl index 3bdf45c77..2fcac3270 100644 --- a/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl +++ b/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl @@ -129,17 +129,13 @@ c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) function knetmlp_predict( - w, # don't put a type annotation on this + w, x0::AbstractArray; probabilities::Bool = true, ) - # x0 = input layer - # x1 = first hidden layer - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) # w[1] = weights, w[2] = biases - # x2 = second hidden layer - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) # w[3] = weights, w[4] = biases - # x3 = output layer - x3 = w[5]*x2 .+ w[6] # w[5] = weights, w[6] = biases + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) + x3 = w[5]*x2 .+ w[6] unnormalizedlogprobs = x3 if probabilities normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) @@ -151,7 +147,7 @@ function knetmlp_predict( end function knetmlp_loss( predict::Function, - modelweights, # don't put a type annotation on this + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -164,7 +160,7 @@ function knetmlp_loss( probabilities = false, ), ytrue, - 1, # d = 1 means that instances are in columns + 1, ) if L1 != 0 loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) From b802ccb729a9b3b2ad94788c8d56808f74b72d5f Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 19:58:51 -0400 Subject: [PATCH 53/62] Progress commit --- .../boston_housing/04_knet_mlp_regression.jl | 28 ++-- .../cpu/boston_housing/05_compare_models.jl | 33 +---- .../cpu/boston_housing/06_get_model_output.jl | 33 +---- 
.../04_random_forest_classifier.jl | 6 +- .../07_knet_mlp_classifier.jl | 36 +++--- .../breast_cancer_biopsy/08_compare_models.jl | 44 +------ .../09_get_model_output.jl | 44 +------ src/io/saveload.jl | 73 ++++++++++- src/linearmodel/glm.jl | 6 + src/neuralnetwork/knet.jl | 122 +++++++++++------- src/pipeline/simplelinearpipeline.jl | 9 ++ src/postprocessing/packagemultilabelpred.jl | 6 + src/postprocessing/packagesinglelabelpred.jl | 6 + src/postprocessing/packagesinglelabelproba.jl | 6 + src/postprocessing/predictoutput.jl | 6 + src/postprocessing/predictprobaoutput.jl | 6 + src/preprocessing/dataframetodecisiontree.jl | 6 + src/preprocessing/dataframetoglm.jl | 6 + src/preprocessing/dataframetoknet.jl | 12 ++ src/preprocessing/dataframetosvm.jl | 6 + src/svm/libsvm.jl | 6 + src/tree/decisiontree.jl | 6 + 22 files changed, 272 insertions(+), 234 deletions(-) diff --git a/examples/cpu/boston_housing/04_knet_mlp_regression.jl b/examples/cpu/boston_housing/04_knet_mlp_regression.jl index 16b532573..df3abbec6 100644 --- a/examples/cpu/boston_housing/04_knet_mlp_regression.jl +++ b/examples/cpu/boston_housing/04_knet_mlp_regression.jl @@ -106,18 +106,21 @@ featurenames = vcat(categoricalfeaturenames, continuousfeaturenames) singlelabelname = :MedV labelnames = [singlelabelname] +knet_mlp_predict_function_source = """ function knetmlp_predict( - w, + w, x0::AbstractArray, ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = w[3]*x1 .+ w[4] + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = w[3]*x1 .+ w[4] return x2 end +""" +knet_mlp_loss_function_source = """ function knetmlp_loss( predict_function::Function, - modelweights, + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -138,21 +141,22 @@ function knetmlp_loss( end return loss end +""" feature_contrasts = PredictMD.generate_feature_contrasts(training_features_df, featurenames) knetmlp_modelweights = Any[ Cfloat.( - 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) + 0.1f0*randn(Cfloat,10,feature_contrasts.num_array_columns) ), Cfloat.( - zeros(Cfloat,10,1) + zeros(Cfloat,10,1) ), Cfloat.( - 0.1f0*randn(Cfloat,1,10) + 0.1f0*randn(Cfloat,1,10) ), Cfloat.( - zeros(Cfloat,1,1), + zeros(Cfloat,1,1), ), ] @@ -169,18 +173,20 @@ knet_mlp_regression = PredictMD.singlelabeldataframeknetregression( singlelabelname; package = :Knetjl, name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, + predict_function_source = knet_mlp_predict_function_source, + loss_function_source = knet_mlp_loss_function_source, losshyperparameters = knetmlp_losshyperparameters, optimizationalgorithm = knetmlp_optimizationalgorithm, optimizerhyperparameters = knetmlp_optimizerhyperparameters, minibatchsize = knetmlp_minibatchsize, modelweights = knetmlp_modelweights, maxepochs = knetmlp_maxepochs, - printlosseverynepochs = 100, + printlosseverynepochs = 100, feature_contrasts = feature_contrasts, ) +PredictMD.parse_functions!(knet_mlp_regression) + PredictMD.fit!( knet_mlp_regression, training_features_df, diff --git a/examples/cpu/boston_housing/05_compare_models.jl b/examples/cpu/boston_housing/05_compare_models.jl index f584a8d3c..e6d2b71a6 100644 --- a/examples/cpu/boston_housing/05_compare_models.jl +++ b/examples/cpu/boston_housing/05_compare_models.jl @@ -103,39 +103,8 @@ knet_mlp_regression_filename = joinpath( linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) - -function knetmlp_predict( - w, - x0::AbstractArray, - ) - x1 = 
Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = w[3]*x1 .+ w[4] - return x2 -end -function knetmlp_loss( - predict_function::Function, - modelweights, - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict_function( - modelweights, - x, - ), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) +PredictMD.parse_functions!(knet_mlp_regression) all_models = PredictMD.Fittable[ linear_regression, diff --git a/examples/cpu/boston_housing/06_get_model_output.jl b/examples/cpu/boston_housing/06_get_model_output.jl index 880bdf167..1d53d2fa8 100644 --- a/examples/cpu/boston_housing/06_get_model_output.jl +++ b/examples/cpu/boston_housing/06_get_model_output.jl @@ -96,39 +96,8 @@ knet_mlp_regression_filename = joinpath( linear_regression = PredictMD.load_model(linear_regression_filename) random_forest_regression = PredictMD.load_model(random_forest_regression_filename) - -function knetmlp_predict( - w, - x0::AbstractArray, - ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = w[3]*x1 .+ w[4] - return x2 -end -function knetmlp_loss( - predict_function::Function, - modelweights, - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = mean( - abs2, - ytrue - predict_function( - modelweights, - x, - ), - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end knet_mlp_regression = PredictMD.load_model(knet_mlp_regression_filename) +PredictMD.parse_functions!(knet_mlp_regression) PredictMD.predict(linear_regression,training_features_df,) PredictMD.predict(random_forest_regression,training_features_df,) diff --git a/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl b/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl index 75992c72d..31b11da0c 100644 --- a/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.jl @@ -131,10 +131,10 @@ random_forest_classifier = PredictMD.singlelabelmulticlassdataframerandomforestc featurenames, singlelabelname, singlelabellevels; - nsubfeatures = 4, - ntrees = 200, + nsubfeatures = 4, + ntrees = 200, package = :DecisionTreejl, - name = "Random forest", + name = "Random forest", feature_contrasts = feature_contrasts, ) diff --git a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl index ccc6a67da..55218e6a0 100644 --- a/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl +++ b/examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.jl @@ -123,14 +123,15 @@ negativeclass = "benign" positiveclass = "malignant" singlelabellevels = [negativeclass, positiveclass] +knet_mlp_predict_function_source = """ function knetmlp_predict( - w, + w, x0::AbstractArray; probabilities::Bool = true, ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) - x3 = w[5]*x2 .+ w[6] + x1 = Knet.relu.( w[1]*x0 .+ w[2] ) + x2 = Knet.relu.( w[3]*x1 .+ w[4] ) + x3 = w[5]*x2 .+ w[6] unnormalizedlogprobs = x3 if probabilities normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) @@ -140,10 +141,12 @@ function knetmlp_predict( return 
unnormalizedlogprobs end end +""" +knet_mlp_loss_function_source = """ function knetmlp_loss( predict::Function, - modelweights, + modelweights, x::AbstractArray, ytrue::AbstractArray; L1::Real = Cfloat(0), @@ -156,7 +159,7 @@ function knetmlp_loss( probabilities = false, ), ytrue, - 1, + 1, ) if L1 != 0 loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) @@ -166,6 +169,7 @@ function knetmlp_loss( end return loss end +""" feature_contrasts = PredictMD.generate_feature_contrasts( smoted_training_features_df, @@ -174,22 +178,22 @@ feature_contrasts = PredictMD.generate_feature_contrasts( knetmlp_modelweights = Any[ Cfloat.( - 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) + 0.1f0*randn(Cfloat,64,feature_contrasts.num_array_columns) ), Cfloat.( - zeros(Cfloat,64,1) + zeros(Cfloat,64,1) ), Cfloat.( - 0.1f0*randn(Cfloat,32,64) + 0.1f0*randn(Cfloat,32,64) ), Cfloat.( - zeros(Cfloat,32,1) + zeros(Cfloat,32,1) ), Cfloat.( - 0.1f0*randn(Cfloat,2,32) + 0.1f0*randn(Cfloat,2,32) ), Cfloat.( - zeros(Cfloat,2,1) + zeros(Cfloat,2,1) ), ] @@ -208,18 +212,20 @@ knet_mlp_classifier = PredictMD.singlelabelmulticlassdataframeknetclassifier( singlelabellevels; package = :Knetjl, name = "Knet MLP", - predict = knetmlp_predict, - loss = knetmlp_loss, + predict_function_source = knet_mlp_predict_function_source, + loss_function_source = knet_mlp_loss_function_source, losshyperparameters = knetmlp_losshyperparameters, optimizationalgorithm = knetmlp_optimizationalgorithm, optimizerhyperparameters = knetmlp_optimizerhyperparameters, minibatchsize = knetmlp_minibatchsize, modelweights = knetmlp_modelweights, - printlosseverynepochs = 100, + printlosseverynepochs = 100, maxepochs = knetmlp_maxepochs, feature_contrasts = feature_contrasts, ) +PredictMD.parse_functions!(knet_mlp_classifier) + PredictMD.fit!( knet_mlp_classifier, smoted_training_features_df, diff --git a/examples/cpu/breast_cancer_biopsy/08_compare_models.jl b/examples/cpu/breast_cancer_biopsy/08_compare_models.jl index 503040491..098f0c3f1 100644 --- a/examples/cpu/breast_cancer_biopsy/08_compare_models.jl +++ b/examples/cpu/breast_cancer_biopsy/08_compare_models.jl @@ -134,50 +134,8 @@ logistic_classifier = PredictMD.load_model(logistic_classifier_filename) random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) - -function knetmlp_predict( - w, - x0::AbstractArray; - probabilities::Bool = true, - ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) - x3 = w[5]*x2 .+ w[6] - unnormalizedlogprobs = x3 - if probabilities - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - else - return unnormalizedlogprobs - end -end -function knetmlp_loss( - predict::Function, - modelweights, - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict( - modelweights, - x; - probabilities = false, - ), - ytrue, - 1, - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) +PredictMD.parse_functions!(knet_mlp_classifier) all_models = PredictMD.Fittable[ logistic_classifier, diff --git 
a/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl b/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl index 2fcac3270..6e57720d0 100644 --- a/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl +++ b/examples/cpu/breast_cancer_biopsy/09_get_model_output.jl @@ -127,50 +127,8 @@ logistic_classifier = PredictMD.load_model(logistic_classifier_filename) random_forest_classifier = PredictMD.load_model(random_forest_classifier_filename) c_svc_svm_classifier = PredictMD.load_model(c_svc_svm_classifier_filename) nu_svc_svm_classifier = PredictMD.load_model(nu_svc_svm_classifier_filename) - -function knetmlp_predict( - w, - x0::AbstractArray; - probabilities::Bool = true, - ) - x1 = Knet.relu.( w[1]*x0 .+ w[2] ) - x2 = Knet.relu.( w[3]*x1 .+ w[4] ) - x3 = w[5]*x2 .+ w[6] - unnormalizedlogprobs = x3 - if probabilities - normalizedlogprobs = Knet.logp(unnormalizedlogprobs, 1) - normalizedprobs = exp.(normalizedlogprobs) - return normalizedprobs - else - return unnormalizedlogprobs - end -end -function knetmlp_loss( - predict::Function, - modelweights, - x::AbstractArray, - ytrue::AbstractArray; - L1::Real = Cfloat(0), - L2::Real = Cfloat(0), - ) - loss = Knet.nll( - predict( - modelweights, - x; - probabilities = false, - ), - ytrue, - 1, - ) - if L1 != 0 - loss += L1 * sum(sum(abs, w_i) for w_i in modelweights[1:2:end]) - end - if L2 != 0 - loss += L2 * sum(sum(abs2, w_i) for w_i in modelweights[1:2:end]) - end - return loss -end knet_mlp_classifier = PredictMD.load_model(knet_mlp_classifier_filename) +PredictMD.parse_functions!(knet_mlp_classifier) PredictMD.predict_proba(logistic_classifier,smoted_training_features_df,) PredictMD.predict_proba(random_forest_classifier,smoted_training_features_df,) diff --git a/src/io/saveload.jl b/src/io/saveload.jl index e96a6d4ee..6c1161955 100644 --- a/src/io/saveload.jl +++ b/src/io/saveload.jl @@ -3,11 +3,25 @@ import FileIO import JLD2 import ProgressMeter +function filename_extension(filename::AbstractString) + result = lowercase(strip(splitext(filename)[2])) + return result +end + """ """ function save_model(filename::AbstractString,fittable_object_to_save::Fittable) - # make sure that the filename ends in ".jld2" - if lowercase(strip(splitext(filename)[2])) != ".jld2" + if filename_extension(filename) == ".jld2" + result = save_model_jld2(filename,fittable_object_to_save) + elseif filename_extension(filename) == ".bson" + result = save_model_bson(filename,fittable_object_to_save) + else + error("extension must be one of: .jld2, .bson") + end +end + +function save_model_jld2(filename::AbstractString,fittable_object_to_save::Fittable) + if filename_extension(filename) != ".jld2" error( string( "Filename \"", @@ -16,7 +30,7 @@ function save_model(filename::AbstractString,fittable_object_to_save::Fittable) ) end dict_of_objects_to_save = Dict( - "saved_model" => fittable_object_to_save, + "jld2_saved_model" => fittable_object_to_save, ) info("INFO Attempting to save model...") # make sure the parent directory exists @@ -28,11 +42,43 @@ function save_model(filename::AbstractString,fittable_object_to_save::Fittable) return nothing end +function save_model_bson(filename::AbstractString,fittable_object_to_save::Fittable) + if filename_extension(filename) != ".bson" + error( + string( + "Filename \"", + filename, + "\" does not end in \".bson\"") + ) + end + dict_of_objects_to_save = Dict( + :bson_saved_model => fittable_object_to_save, + ) + info("INFO Attempting to save model...") + # make sure the parent directory exists + parent_directory = 
Base.Filesystem.dirname(filename) + Base.Filesystem.mkpath(parent_directory) + # save the .bson file + BSON.bson(filename, dict_of_objects_to_save) + info(string("INFO Saved model to file \"", filename, "\"")) +end + """ """ function load_model(filename::AbstractString) - # make sure that the filename ends in ".jld2" - if lowercase(strip(splitext(filename)[2])) != ".jld2" + if filename_extension(filename) == ".jld2" + result = load_model_jld2(filename) + return result + elseif filename_extension(filename) == ".bson" + result = load_model_bson(filename) + return result + else + error("extension must be one of: .jld2, .bson") + end +end + +function load_model_jld2(filename::AbstractString) + if filename_extension(filename) != ".jld2" error( string( "Filename \"", @@ -42,7 +88,22 @@ function load_model(filename::AbstractString) end info("INFO Attempting to load model...") dict_of_loaded_objects = FileIO.load(filename) - loaded_fittable_object = dict_of_loaded_objects["saved_model"] + loaded_fittable_object = dict_of_loaded_objects["jld2_saved_model"] info(string("INFO Loaded model from file \"", filename, "\"")) return loaded_fittable_object end + +function load_model_bson(filename::AbstractString) + if filename_extension(filename) != ".bson" + error( + string( + "Filename \"", + filename, + "\" does not end in \".bson\"") + ) + end + info("INFO Attempting to load model...") + dict_of_loaded_objects = BSON.load(filename) + loaded_fittable_object = dict_of_loaded_objects[:bson_saved_model] + info(string("INFO Loaded model from file \"", filename, "\"")) +end diff --git a/src/linearmodel/glm.jl b/src/linearmodel/glm.jl index fb23a8c95..7c08e15fb 100644 --- a/src/linearmodel/glm.jl +++ b/src/linearmodel/glm.jl @@ -68,6 +68,12 @@ function get_underlying( return result end +""" +""" +function parse_functions!(estimator::GLMModel) + return nothing +end + """ """ function fit!( diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index 5015125d2..ee41ddfda 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -10,27 +10,31 @@ mutable struct KnetModel <: AbstractEstimator isregressionmodel::T3 where T3 <: Bool # hyperparameters (not learned from data): - predict::T4 where T4 <: Function - loss::T5 where T5 <: Function - losshyperparameters::T6 where T6 <: Associative - optimizationalgorithm::T7 where T7 <: Symbol - optimizerhyperparameters::T8 where T8 <: Associative - minibatchsize::T9 where T9 <: Integer - maxepochs::T10 where T10 <: Integer - printlosseverynepochs::T11 where T11 <: Integer + predict_function_source::T4 where T4 <: AbstractString + loss_function_source::T5 where T5 <: AbstractString + predict_function::T6 where T6 <: Union{Void, Function, Any} + loss_function::T7 where T7 <: Union{Void, Function, Any} + losshyperparameters::T8 where T8 <: Associative + optimizationalgorithm::T9 where T9 <: Symbol + optimizerhyperparameters::T10 where T10 <: Associative + minibatchsize::T11 where T11 <: Integer + maxepochs::T12 where T12 <: Integer + printlosseverynepochs::T13 where T13 <: Integer # parameters (learned from data): - modelweights::T12 where T12 <: AbstractArray - modelweightoptimizers::T13 where T13 <: Any # TODO: do something better here + modelweights::T14 where T14 <: AbstractArray + modelweightoptimizers::T15 where T15 <: Any # learning state - history::T where T <: ValueHistories.MultivalueHistory + history::T16 where T16 <: ValueHistories.MultivalueHistory function KnetModel( ; name::AbstractString = "", - predict::Function = () -> (), - 
loss::Function =() -> (), + predict_function_source::AbstractString = "", + loss_function_source::AbstractString = "", + predict_function::Function = identity, + loss_function::Function = identity, losshyperparameters::Associative = Dict(), optimizationalgorithm::Symbol = :nothing, optimizerhyperparameters::Associative = Dict(), @@ -70,8 +74,10 @@ mutable struct KnetModel <: AbstractEstimator name, isclassificationmodel, isregressionmodel, - predict, - loss, + predict_function_source, + loss_function_source, + predict_function, + loss_function, losshyperparameters, optimizationalgorithm, optimizerhyperparameters, @@ -117,6 +123,24 @@ function get_history( return result end +function parse_functions!(estimator::KnetModel) + estimator.predict_function = eval( + parse( + strip( + estimator.predict_function_source + ) + ) + ) + estimator.loss_function = eval( + parse( + strip( + estimator.loss_function_source + ) + ) + ) + return nothing +end + """ """ function fit!( @@ -150,8 +174,8 @@ function fit!( training_labels_array, estimator.minibatchsize, ) - loss_gradient = Knet.grad( - estimator.loss, + loss_function_gradient = Knet.grad( + estimator.loss_function, 2, ) all_iterations_so_far, all_epochs_so_far = ValueHistories.get( @@ -167,16 +191,16 @@ function fit!( ".", ) ) - training_lossbeforetrainingstarts = estimator.loss( - estimator.predict, + training_lossbeforetrainingstarts = estimator.loss_function( + estimator.predict_function, estimator.modelweights, training_features_array, training_labels_array; estimator.losshyperparameters... ) if has_validation_data - validation_lossbeforetrainingstarts = estimator.loss( - estimator.predict, + validation_lossbeforetrainingstarts = estimator.loss_function( + estimator.predict_function, estimator.modelweights, validation_features_array, validation_labels_array; @@ -210,8 +234,8 @@ function fit!( end while last_epoch < estimator.maxepochs for (x_training, y_training) in training_data - grads = loss_gradient( - estimator.predict, + grads = loss_function_gradient( + estimator.predict_function, estimator.modelweights, x_training, y_training; @@ -223,8 +247,8 @@ function fit!( estimator.modelweightoptimizers, ) last_iteration += 1 - training_currentiterationloss = estimator.loss( - estimator.predict, + training_currentiterationloss = estimator.loss_function( + estimator.predict_function, estimator.modelweights, x_training, y_training; @@ -244,8 +268,8 @@ function fit!( last_iteration, last_epoch, ) - training_currentepochloss = estimator.loss( - estimator.predict, + training_currentepochloss = estimator.loss_function( + estimator.predict_function, estimator.modelweights, training_features_array, training_labels_array; @@ -258,8 +282,8 @@ function fit!( training_currentepochloss, ) if has_validation_data - validation_currentepochloss = estimator.loss( - estimator.predict, + validation_currentepochloss = estimator.loss_function( + estimator.predict_function, estimator.modelweights, validation_features_array, validation_labels_array; @@ -322,7 +346,7 @@ function predict( ) return predictionsvector elseif estimator.isregressionmodel - output = estimator.predict( + output = estimator.predict_function( estimator.modelweights, featuresarray; ) @@ -341,7 +365,7 @@ function predict_proba( featuresarray::AbstractArray, ) if estimator.isclassificationmodel - output = estimator.predict( + output = estimator.predict_function( estimator.modelweights, featuresarray; probabilities = true, @@ -367,8 +391,8 @@ function _singlelabelmulticlassdataframeknetclassifier_Knet( 
singlelabelname::Symbol, singlelabellevels::AbstractVector; name::AbstractString = "", - predict::Function = () -> (), - loss::Function = () -> (), + predict_function_source::AbstractString = "", + loss_function_source::AbstractString = "", losshyperparameters::Associative = Dict(), optimizationalgorithm::Symbol = :nothing, optimizerhyperparameters::Associative = Dict(), @@ -396,8 +420,8 @@ function _singlelabelmulticlassdataframeknetclassifier_Knet( knetestimator = KnetModel( ; name = name, - predict = predict, - loss = loss, + predict_function_source = predict_function_source, + loss_function_source = loss_function_source, losshyperparameters = losshyperparameters, optimizationalgorithm = optimizationalgorithm, optimizerhyperparameters = optimizerhyperparameters, @@ -408,11 +432,11 @@ function _singlelabelmulticlassdataframeknetclassifier_Knet( maxepochs = maxepochs, printlosseverynepochs = printlosseverynepochs, ) - predprobalabelfixer = ImmutablePredictProbaSingleLabelInt2StringTransformer( + predprobalabelfixer = ImmutablePredictProbaSingleLabelInTStringTransformer( 1, singlelabellevels ) - predictlabelfixer = ImmutablePredictionsSingleLabelInt2StringTransformer( + predictlabelfixer = ImmutablePredictionsSingleLabelInTStringTransformer( 1, singlelabellevels ) @@ -447,8 +471,8 @@ function singlelabelmulticlassdataframeknetclassifier( singlelabellevels::AbstractVector; package::Symbol = :none, name::AbstractString = "", - predict::Function = () -> (), - loss::Function =() -> (), + predict_function_source::AbstractString = "", + loss_function_source::AbstractString = "", losshyperparameters::Associative = Dict(), optimizationalgorithm::Symbol = :nothing, optimizerhyperparameters::Associative = Dict(), @@ -464,8 +488,8 @@ function singlelabelmulticlassdataframeknetclassifier( singlelabelname, singlelabellevels; name = name, - predict = predict, - loss = loss, + predict_function_source = predict_function_source, + loss_function_source = loss_function_source, losshyperparameters = losshyperparameters, optimizationalgorithm = optimizationalgorithm, optimizerhyperparameters = optimizerhyperparameters, @@ -487,8 +511,8 @@ function _singlelabeldataframeknetregression_Knet( featurenames::AbstractVector, singlelabelname::Symbol; name::AbstractString = "", - predict::Function = () -> (), - loss::Function =() -> (), + predict_function_source::AbstractString = "", + loss_function_source::AbstractString = "", losshyperparameters::Associative = Dict(), optimizationalgorithm::Symbol = :nothing, optimizerhyperparameters::Associative = Dict(), @@ -511,8 +535,8 @@ function _singlelabeldataframeknetregression_Knet( knetestimator = KnetModel( ; name = name, - predict = predict, - loss = loss, + predict_function_source = predict_function_source, + loss_function_source = loss_function_source, losshyperparameters = losshyperparameters, optimizationalgorithm = optimizationalgorithm, optimizerhyperparameters = optimizerhyperparameters, @@ -547,8 +571,8 @@ function singlelabeldataframeknetregression( singlelabelname::Symbol; package::Symbol = :none, name::AbstractString = "", - predict::Function = () -> (), - loss::Function =() -> (), + predict_function_source::AbstractString = "", + loss_function_source::AbstractString = "", losshyperparameters::Associative = Dict(), optimizationalgorithm::Symbol = :nothing, optimizerhyperparameters::Associative = Dict(), @@ -563,8 +587,8 @@ function singlelabeldataframeknetregression( featurenames, singlelabelname; name = name, - predict = predict, - loss = loss, + 
predict_function_source = predict_function_source, + loss_function_source = loss_function_source, losshyperparameters = losshyperparameters, optimizationalgorithm = optimizationalgorithm, optimizerhyperparameters = optimizerhyperparameters, diff --git a/src/pipeline/simplelinearpipeline.jl b/src/pipeline/simplelinearpipeline.jl index df724c712..b60677e60 100644 --- a/src/pipeline/simplelinearpipeline.jl +++ b/src/pipeline/simplelinearpipeline.jl @@ -84,6 +84,15 @@ function get_history( return history end +""" +""" +function parse_functions!(simplelinearpipeline::SimplePipeline) + for i = 1:length(simplelinearpipeline.objectsvector) + parse_functions!(simplelinearpipeline.objectsvector[i]) + end + return nothing +end + """ """ function fit!( diff --git a/src/postprocessing/packagemultilabelpred.jl b/src/postprocessing/packagemultilabelpred.jl index 7492edf0c..d1453520d 100644 --- a/src/postprocessing/packagemultilabelpred.jl +++ b/src/postprocessing/packagemultilabelpred.jl @@ -34,6 +34,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(transformer::ImmutablePackageMultiLabelPredictionTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/postprocessing/packagesinglelabelpred.jl b/src/postprocessing/packagesinglelabelpred.jl index 8fccf64bb..1f0b6ce3d 100644 --- a/src/postprocessing/packagesinglelabelpred.jl +++ b/src/postprocessing/packagesinglelabelpred.jl @@ -34,6 +34,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(transformer::ImmutablePackageSingleLabelPredictionTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/postprocessing/packagesinglelabelproba.jl b/src/postprocessing/packagesinglelabelproba.jl index 807b0de73..fe77a6db1 100644 --- a/src/postprocessing/packagesinglelabelproba.jl +++ b/src/postprocessing/packagesinglelabelproba.jl @@ -34,6 +34,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(transformer::ImmutablePackageSingleLabelPredictProbaTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/postprocessing/predictoutput.jl b/src/postprocessing/predictoutput.jl index c11c8c515..336e81e83 100644 --- a/src/postprocessing/predictoutput.jl +++ b/src/postprocessing/predictoutput.jl @@ -37,6 +37,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(transformer::ImmutablePredictionsSingleLabelInt2StringTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/postprocessing/predictprobaoutput.jl b/src/postprocessing/predictprobaoutput.jl index 89a018968..edeed6f50 100644 --- a/src/postprocessing/predictprobaoutput.jl +++ b/src/postprocessing/predictprobaoutput.jl @@ -35,6 +35,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(transformer::ImmutablePredictProbaSingleLabelInt2StringTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/preprocessing/dataframetodecisiontree.jl b/src/preprocessing/dataframetodecisiontree.jl index 92093b1ac..0f7efa52c 100644 --- a/src/preprocessing/dataframetodecisiontree.jl +++ b/src/preprocessing/dataframetodecisiontree.jl @@ -101,6 +101,12 @@ function transform( return featuresarray end +""" +""" +function parse_functions!(transformer::MutableDataFrame2DecisionTreeTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/preprocessing/dataframetoglm.jl b/src/preprocessing/dataframetoglm.jl index 263763795..e66331375 100644 --- 
a/src/preprocessing/dataframetoglm.jl +++ b/src/preprocessing/dataframetoglm.jl @@ -64,6 +64,12 @@ function transform( return features_df end +""" +""" +function parse_functions!(transformer::ImmutableDataFrame2GLMSingleLabelBinaryClassTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/preprocessing/dataframetoknet.jl b/src/preprocessing/dataframetoknet.jl index ffd7b90b7..2ad824b68 100644 --- a/src/preprocessing/dataframetoknet.jl +++ b/src/preprocessing/dataframetoknet.jl @@ -119,6 +119,18 @@ function set_feature_contrasts!( return nothing end +""" +""" +function parse_functions!(transformer::MutableDataFrame2ClassificationKnetTransformer) + return nothing +end + +""" +""" +function parse_functions!(transformer::MutableDataFrame2RegressionKnetTransformer,) + return nothing +end + """ """ function fit!( diff --git a/src/preprocessing/dataframetosvm.jl b/src/preprocessing/dataframetosvm.jl index 4dc617492..7daa12a10 100644 --- a/src/preprocessing/dataframetosvm.jl +++ b/src/preprocessing/dataframetosvm.jl @@ -59,6 +59,12 @@ function transform( return featuresarraytransposed end +""" +""" +function parse_functions!(transformer::ImmutableFeatureArrayTransposerTransformer) + return nothing +end + """ """ function fit!( diff --git a/src/svm/libsvm.jl b/src/svm/libsvm.jl index 9a583c47d..513724fc2 100644 --- a/src/svm/libsvm.jl +++ b/src/svm/libsvm.jl @@ -94,6 +94,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(estimator::LIBSVMModel) + return nothing +end + """ """ function fit!( diff --git a/src/tree/decisiontree.jl b/src/tree/decisiontree.jl index 9598832d4..277612c2a 100644 --- a/src/tree/decisiontree.jl +++ b/src/tree/decisiontree.jl @@ -80,6 +80,12 @@ function get_history( return nothing end +""" +""" +function parse_functions!(estimator::DecisionTreeModel) + return nothing +end + """ """ function fit!( From 779dd5e39bfe653f9e35ec085d47351c740a3035 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 20:16:34 -0400 Subject: [PATCH 54/62] Fix bugs --- docs/deploy_docs.jl | 4 ++++ docs/make_docs.jl | 5 +++++ src/neuralnetwork/knet.jl | 4 ++-- src/utils/openbrowserwindow.jl | 15 ++++++++++++--- src/utils/runtestsenv.jl | 16 +++++++++++++++- test/runtests.jl | 4 +++- 6 files changed, 41 insertions(+), 7 deletions(-) diff --git a/docs/deploy_docs.jl b/docs/deploy_docs.jl index 3c70459fe..490e75efa 100644 --- a/docs/deploy_docs.jl +++ b/docs/deploy_docs.jl @@ -1,6 +1,8 @@ import Documenter import PredictMD +ENV["PREDICTMD_IS_DEPLOY_DOCS"] = "true" + Documenter.deploydocs( branch = "gh-pages", deps = Documenter.Deps.pip( @@ -14,3 +16,5 @@ Documenter.deploydocs( repo = "github.com/bcbi/PredictMD.jl.git", target = "site", ) + +ENV["PREDICTMD_IS_DEPLOY_DOCS"] = "false" diff --git a/docs/make_docs.jl b/docs/make_docs.jl index b73d991d6..68cc8b6f6 100644 --- a/docs/make_docs.jl +++ b/docs/make_docs.jl @@ -2,6 +2,8 @@ import Documenter import Literate import PredictMD +ENV["PREDICTMD_IS_MAKE_DOCS"] = "true" + info("DEBUG: using Literate.jl to generate examples") examples_input_parent_directory = joinpath( @@ -330,3 +332,6 @@ Documenter.makedocs( "library/internals.md", ], ) + + +ENV["PREDICTMD_IS_MAKE_DOCS"] = "false" diff --git a/src/neuralnetwork/knet.jl b/src/neuralnetwork/knet.jl index ee41ddfda..9bb37be0c 100644 --- a/src/neuralnetwork/knet.jl +++ b/src/neuralnetwork/knet.jl @@ -432,11 +432,11 @@ function _singlelabelmulticlassdataframeknetclassifier_Knet( maxepochs = maxepochs, printlosseverynepochs = 
printlosseverynepochs, ) - predprobalabelfixer = ImmutablePredictProbaSingleLabelInTStringTransformer( + predprobalabelfixer = ImmutablePredictProbaSingleLabelInt2StringTransformer( 1, singlelabellevels ) - predictlabelfixer = ImmutablePredictionsSingleLabelInTStringTransformer( + predictlabelfixer = ImmutablePredictionsSingleLabelInt2StringTransformer( 1, singlelabellevels ) diff --git a/src/utils/openbrowserwindow.jl b/src/utils/openbrowserwindow.jl index 435182e17..c549eb7c3 100644 --- a/src/utils/openbrowserwindow.jl +++ b/src/utils/openbrowserwindow.jl @@ -2,13 +2,22 @@ """ """ -function open_browser_window(filename::AbstractString) - if is_travis_ci(ENV) +function open_browser_window( + filename::AbstractString, + env_dict::Associative = ENV, + ) + if is_travis_ci(env_dict) info(string("DEBUG Skipping opening file during Travis build: ",filename,)) return nothing - elseif is_runtests(ENV) && !open_plots_during_tests(ENV) + elseif is_runtests(env_dict) && !open_plots_during_tests(env_dict) info(string("DEBUG Skipping opening file during package tests: ",filename,)) return nothing + elseif is_make_docs(env_dict) + info(string("DEBUG Skipping opening file during make_docs: ",filename,)) + return nothing + elseif is_deploy_docs(env_dict) + info(string("DEBUG Skipping opening file during deploy_docs: ",filename,)) + return nothing else info(string("DEBUG Opening file ",filename,)) if is_apple() diff --git a/src/utils/runtestsenv.jl b/src/utils/runtestsenv.jl index 316c096f2..a5079b640 100644 --- a/src/utils/runtestsenv.jl +++ b/src/utils/runtestsenv.jl @@ -1,6 +1,20 @@ """ """ function is_runtests(a::Associative) - result = lowercase(get(a, "PREDICTMD_RUNTESTS", "")) == lowercase("true") + result = lowercase(get(a, "PREDICTMD_IS_RUNTESTS", "")) == lowercase("true") + return result +end + +""" +""" +function is_make_docs(a::Associative) + result = lowercase(get(a, "PREDICTMD_IS_MAKE_DOCS", "")) == lowercase("true") + return result +end + +""" +""" +function is_deploy_docs(a::Associative) + result = lowercase(get(a, "PREDICTMD_IS_DEPLOY_DOCS", "")) == lowercase("true") return result end diff --git a/test/runtests.jl b/test/runtests.jl index f82720913..ebbe6c413 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,7 +9,7 @@ info("INFO Successfully imported PredictMD") info("INFO Printing PredictMD version info:") println(string("PredictMD Version ", PredictMD.VERSION)) -ENV["PREDICTMD_RUNTESTS"] = "true" +ENV["PREDICTMD_IS_RUNTESTS"] = "true" Base.Test.@testset "PredictMD test suite" begin Base.Test.@testset "Unit tests (CPU)" begin @@ -53,3 +53,5 @@ Base.Test.@testset "PredictMD test suite" begin end end end + +ENV["PREDICTMD_IS_RUNTESTS"] = "false" From 3dc428d00d823dc0cbc1d3f9f5387df31a3f2850 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 21:24:35 -0400 Subject: [PATCH 55/62] Various fixes --- docs/mkdocs.yml | 23 ++++++++++++++++++++--- docs/src/examples.md | 0 2 files changed, 20 insertions(+), 3 deletions(-) delete mode 100644 docs/src/examples.md diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 453cd9ee5..ed3593855 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -21,7 +21,24 @@ markdown_extensions: docs_dir: 'build' pages: - - Home: index.md - - Examples: examples.md - - Library: + - 'Home': index.md + - 'Examples': + - 'Boston housing (single label regression)': + - '1. Preprocess data': 'examples/cpu/boston_housing/01_preprocess_data.md' + - '2. Linear regression': 'examples/cpu/boston_housing/02_linear_regression.md' + - '3. 
Random forest regression': 'examples/cpu/boston_housing/03_random_forest_regression.md' + - '4. Knet neural network regression': 'examples/cpu/boston_housing/04_knet_mlp_regression.md' + - '5. Compare models': 'examples/cpu/boston_housing/05_compare_models.md' + - '6. Directly access model output': 'examples/cpu/boston_housing/06_get_model_output.md' + - 'Breast cancer biopsy (single label binary classification)': + - '1. Preprocess data': 'examples/cpu/breast_cancer_biopsy/01_preprocess_data.md' + - '2. Apply SMOTE algorithm': 'examples/cpu/breast_cancer_biopsy/02_smote.md' + - '3. Logistic classifier': 'examples/cpu/breast_cancer_biopsy/03_logistic_classifier.md' + - '4. Random forest classifier': 'examples/cpu/breast_cancer_biopsy/04_random_forest_classifier.md' + - '5. C-SVC support vector machine classifier': 'examples/cpu/breast_cancer_biopsy/05_c_svc_svm_classifier.md' + - '6. nu-SVC support vector machine classifier': 'examples/cpu/breast_cancer_biopsy/06_nu_svc_svm_classifier.md' + - '7. Knet neural network classifier': 'examples/cpu/breast_cancer_biopsy/07_knet_mlp_classifier.md' + - '8. Compare models': 'examples/cpu/breast_cancer_biopsy/08_compare_models.md' + - '9. Directly access model output': 'examples/cpu/breast_cancer_biopsy/09_get_model_output.md' + - 'Library': - 'Internals': 'library/internals.md' diff --git a/docs/src/examples.md b/docs/src/examples.md deleted file mode 100644 index e69de29bb..000000000 From c32980b6200234a109b564ca1411ce0052ae8f25 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 21:25:24 -0400 Subject: [PATCH 56/62] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 57e474a1d..28430018e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ *.ipynb_checkpoints/ data/ docs/build/ -docs/generated/ docs/site/ docs/src/examples/ input/ From 4a89959c0a96a284cb4df06b039720d5ee7642ea Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 21:52:38 -0400 Subject: [PATCH 57/62] Progress commit --- .travis.yml | 71 +++++++++++++++++++---------------- docs/src/library/internals.md | 10 ++--- test/runtests.jl | 2 + 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.travis.yml b/.travis.yml index 618f5961a..e7ce590b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,44 +1,49 @@ -## Documentation: http://docs.travis-ci.com/user/languages/julia/ language: julia + os: - - linux - - osx + - linux + - osx + julia: - - 0.6 -# - nightly + - 0.6 + - nightly + notifications: - email: false + email: false + git: - depth: 99999999 + depth: 99999999 -## Allow failures on nightly julia: matrix: - allow_failures: - - julia: nightly + allow_failures: + - julia: nightly addons: - apt: - packages: - - gfortran - - pdf2svg - - pgf - - poppler-utils # provides /usr/bin/pdftoppm - - texlive-binaries - - texlive-latex-base - - texlive-latex-extra - - texlive-pictures + apt: + packages: + - gfortran + - pdf2svg + - pgf + - poppler-utils + - texlive-binaries + - texlive-latex-base + - texlive-latex-extra + - texlive-pictures before_install: - # update homebrew - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - # install mactex - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew cask install basictex; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PATH=/Library/TeX/texbin:"$PATH"; fi - # update tlmgr - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr update --self; fi - # install tex packages - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr 
install luatex85; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr install pgfplots; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr install standalone; fi - # install pdf2svg - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install pdf2svg; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew cask install basictex; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PATH=/Library/TeX/texbin:"$PATH"; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr update --self; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr install luatex85; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr install pgfplots; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then sudo tlmgr install standalone; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install pdf2svg; fi + +script: + - julia --color=yes -e 'Pkg.clone(pwd())' + - julia --color=yes -e "Pkg.build(\"PredictMD\")" + - julia --check-bounds=yes --color=yes -e "Pkg.test(\"PredictMD\", coverage=true)" + +after_success: + - julia --check-bounds=yes --color=yes -e "include(joinpath(Pkg.dir(\"PredictMD\"), \"docs\", \"deploy_docs.jl\"))" diff --git a/docs/src/library/internals.md b/docs/src/library/internals.md index bff731afd..8ff4e29e3 100644 --- a/docs/src/library/internals.md +++ b/docs/src/library/internals.md @@ -7,35 +7,35 @@ Pages = ["internals.md"] ## Modules ```@autodocs -Modules = [PredictMD] +Modules = [PredictMD, PredictMD.GPU] Order = [:module] ``` ## Constants ```@autodocs -Modules = [PredictMD] +Modules = [PredictMD, PredictMD.GPU] Order = [:constant] ``` ## Types ```@autodocs -Modules = [PredictMD] +Modules = [PredictMD, PredictMD.GPU] Order = [:type] ``` ## Functions ```@autodocs -Modules = [PredictMD] +Modules = [PredictMD, PredictMD.GPU] Order = [:function] ``` ## Macros ```@autodocs -Modules = [PredictMD] +Modules = [PredictMD, PredictMD.GPU] Order = [:macro] ``` diff --git a/test/runtests.jl b/test/runtests.jl index ebbe6c413..4a0b1cb51 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,6 +33,7 @@ Base.Test.@testset "PredictMD test suite" begin Base.Test.@testset "Test examples (CPU)" begin info("INFO testing examples (CPU)") Base.Test.@testset "Boston housing regression" begin + info("INFO testing Boston housing regression") include("../docs/src/examples/cpu/boston_housing/01_preprocess_data.jl") include("../docs/src/examples/cpu/boston_housing/02_linear_regression.jl") include("../docs/src/examples/cpu/boston_housing/03_random_forest_regression.jl") @@ -41,6 +42,7 @@ Base.Test.@testset "PredictMD test suite" begin include("../docs/src/examples/cpu/boston_housing/06_get_model_output.jl") end Base.Test.@testset "Breast cancer biopsy classification" begin + info("INFO testing breast cancer biopsy classification") include("../docs/src/examples/cpu/breast_cancer_biopsy/01_preprocess_data.jl") include("../docs/src/examples/cpu/breast_cancer_biopsy/02_smote.jl") include("../docs/src/examples/cpu/breast_cancer_biopsy/03_logistic_classifier.jl") From 0ced184e915c5eb3f6ba9fb5953b679551a36e0c Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 22:21:36 -0400 Subject: [PATCH 58/62] Progress commit. 
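
Run deploy_docs.jl as a standalone script from inside the docs directory in
the Travis after_success step, rather than include()-ing it through
Pkg.dir.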
--- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e7ce590b7..7c3b2ae5f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,4 +46,5 @@ script: - julia --check-bounds=yes --color=yes -e "Pkg.test(\"PredictMD\", coverage=true)" after_success: - - julia --check-bounds=yes --color=yes -e "include(joinpath(Pkg.dir(\"PredictMD\"), \"docs\", \"deploy_docs.jl\"))" + - cd docs + - julia --check-bounds=yes --color=yes deploy_docs.jl From e693c2db81c2504e8f4d20f2b32baa7a688d49e0 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Tue, 22 May 2018 23:40:16 -0400 Subject: [PATCH 59/62] Set up Travis to build and deploy docs --- .travis.yml | 7 +++---- docs/deploy_docs.jl | 4 ++-- docs/mkdocs.yml | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7c3b2ae5f..455f0fdfd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,9 +42,8 @@ before_install: script: - julia --color=yes -e 'Pkg.clone(pwd())' - - julia --color=yes -e "Pkg.build(\"PredictMD\")" - - julia --check-bounds=yes --color=yes -e "Pkg.test(\"PredictMD\", coverage=true)" + - julia --color=yes -e 'Pkg.build("PredictMD")' + - julia --check-bounds=yes --color=yes -e 'Pkg.test("PredictMD", coverage=true)' after_success: - - cd docs - - julia --check-bounds=yes --color=yes deploy_docs.jl + - julia -e 'cd(Pkg.dir("PredictMD")); include(joinpath("docs", "deploy_docs.jl"))' diff --git a/docs/deploy_docs.jl b/docs/deploy_docs.jl index 490e75efa..81ecab258 100644 --- a/docs/deploy_docs.jl +++ b/docs/deploy_docs.jl @@ -6,12 +6,12 @@ ENV["PREDICTMD_IS_DEPLOY_DOCS"] = "true" Documenter.deploydocs( branch = "gh-pages", deps = Documenter.Deps.pip( - "pygments", "mkdocs", + "pygments", "python-markdown-math", ), julia = "0.6", - latest = "develop", # latest = develop branch + latest = "develop", osname = "linux", repo = "github.com/bcbi/PredictMD.jl.git", target = "site", diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index ed3593855..39e7afef8 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -14,9 +14,9 @@ extra_javascript: markdown_extensions: - extra - - tables - fenced_code - mdx_math + - tables docs_dir: 'build' From 39cb2ab7c24ff16270be1bf26ae8fd2833953dd6 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Wed, 23 May 2018 02:19:52 -0400 Subject: [PATCH 60/62] Add Documenter.jl badges to README --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 3efae6536..a5878b2b9 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,22 @@ + + + + + + +
branch | master | develop
travis | Build Status (master) | Build Status (develop)
docs | Documentation (stable) | Documentation (latest)
From e78b93f8c388dc71c0508e3603997cdc1b674502 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Wed, 23 May 2018 02:37:50 -0400 Subject: [PATCH 61/62] Remove outdated content from CONTRIBUTING.md (this content is no longer relevant now that we use Literate.jl to generate notebooks) --- CONTRIBUTING.md | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 44d9afebf..896128094 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,9 +14,6 @@ This document provides information on contributing to the PredictMD source code. 2. Setting up the PredictMD repo - - - 3. Working with examples @@ -156,41 +153,3 @@ cd ~/.julia/v0.6/PredictMD ```bash git config commit.gpgsign true && git remote set-url origin https://github.com/bcbi/PredictMD.jl.git && git remote set-url --push origin git@github.com:bcbi/PredictMD.jl.git && git checkout master && git checkout develop && git flow init -fd && git checkout develop && git fetch --all --prune ``` - -## 3. Working with examples - -Some of the examples are provided as Jupyter notebooks as a convinient way to visualize and interact with the code. However, we also like to mantain corresponding plain Julia scripts that are in-sync with the notebooks. A convinient way to do so, is to add a post-save hook to your Jupyter configuration file. - -1. Open you jupyter configuration file ~/.jupyter/jupyter_notebook_config.py. If the file does not exist you can generate it by running `jupyter notebook --generate-config` - -2. Add the following code to the top of the file - -```python -#----------------------------------------------------------------------------- -# Auto save script version of notebook -# Reference: https://svds.com/jupyter-notebook-best-practices-for-data-science/ -#----------------------------------------------------------------------------- - -import os -from subprocess import check_call - -def post_save(model, os_path, contents_manager): - """post-save hook for converting notebooks to .py scripts""" - if model['type'] != 'notebook': - return # only do this for notebooks - d, fname = os.path.split(os_path) - check_call(['jupyter', 'nbconvert', '--to', 'script', fname], cwd=d) - -c.FileContentsManager.post_save_hook = post_save -``` - -**Note:** This behavior is global. If you want to have this saving only when in a particular folder, you can create multiple configuration files as a work-around. First create a new profile name via a bash command line: -```bash -export JUPYTER_CONFIG_DIR=~/.jupyter_profile2 -jupyter notebook --generate-config -``` -This will create a new directory and file at `~/.jupyter_profile2/jupyter_notebook_config.py` Then run jupyter notebook and work as usual. To switch back to your default profile you will have to set (either by hand, shell function, or your .bashrc) back to: - -```bash -export JUPYTER_CONFIG_DIR=~/.jupyter -``` From 02cf9d8dcaed6195b2ccb0212893362438e55b35 Mon Sep 17 00:00:00 2001 From: Dilum Aluthge Date: Thu, 24 May 2018 03:27:49 -0400 Subject: [PATCH 62/62] Bump version number from "v0.13.0-DEV" to "v0.13.0" --- src/base/version.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/version.jl b/src/base/version.jl index 2425250d3..c22f9fb49 100644 --- a/src/base/version.jl +++ b/src/base/version.jl @@ -1,5 +1,5 @@ const VERSION = try - convert(VersionNumber, "v0.13.0-DEV") + convert(VersionNumber, "v0.13.0") catch e warn("WARN While creating PredictMD.VERSION, ignoring error $(e)") VersionNumber(0)
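Patch 53 above replaces the live `predict`/`loss` `Function` fields of
`KnetModel` with `predict_function_source`/`loss_function_source` strings, and
`parse_functions!` rebuilds the callable functions from those strings with
`eval(parse(...))`. Below is a minimal sketch of that round-trip, assuming
Julia 0.6 syntax (the version this series targets). Storing source strings
rather than `Function` objects presumably lets a trained model be saved to
disk and its functions rebuilt after reloading; `toy_predict` and its body are
hypothetical illustrations, not code from the PredictMD codebase.

```julia
# Minimal sketch (Julia 0.6) of the function-source round-trip that
# parse_functions!(estimator::KnetModel) performs for each stored string.
# `toy_predict` is a hypothetical example, not a PredictMD function.

predict_function_source = """
function toy_predict(w, x)
    return w[1] * x .+ w[2]
end
"""

# Equivalent to what parse_functions! does internally: strip the stored
# source, parse it into an expression, and evaluate it to get a Function.
predict_function = eval(parse(strip(predict_function_source)))

w = (2.0, 1.0)
predict_function(w, [1.0, 2.0]) == [3.0, 5.0]  # true
```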