-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFAGE_Analysis_Pipeline.Rmd
1396 lines (1072 loc) · 70.8 KB
/
FAGE_Analysis_Pipeline.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "FAGE Analysis Pipeline (ver 0.6.2)"
author:
- Khaled Al-Shamaa, Author and Maintainer
- Salvador Gezan, Contributor
- Zakaria Kehel, Contributor
- ICARDA, Copyright Holder
date: '`r format(Sys.time(), "At %l:%M %p on %A, %e %B, %Y")`'
output:
html_document:
number_sections: TRUE
code_folding: hide
toc: TRUE
toc_float: TRUE
toc_depth: 4
mathjax: local
self_contained: false
---
<a href="https://github.com/icarda-git/FAGE/" target="_blank" class="github-corner" aria-label="View source on GitHub"><svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
<style> body {text-align: justify} </style>
```{r knitr_setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)

# Render errors, warnings, and messages as styled Bootstrap alert boxes
# (danger / warning / info) in the knitted HTML instead of plain "##" output.
# FIX: the original patterns contained the invalid escape "\ " inside
# single-quoted strings ('^##\ Error'), which R rejects as an unrecognized
# escape at parse time; a literal space is the intended match.
knitr::knit_hooks$set(
  error = function(x, options) {
    paste('\n\n<div class="alert alert-danger">',
          gsub('##', '\n', gsub('^## Error', '**Error**', x)),
          '</div>', sep = '\n')
  },
  warning = function(x, options) {
    paste('\n\n<div class="alert alert-warning">',
          gsub('##', '\n', gsub('^## Warning:', '**Warning**', x)),
          '</div>', sep = '\n')
  },
  message = function(x, options) {
    paste('\n\n<div class="alert alert-info">',
          gsub('##', '\n', x),
          '</div>', sep = '\n')
  }
)

# Helper: render a data frame as an interactive DT table with export buttons
# (copy/csv/excel/pdf/print) and a page-length menu; numeric columns are
# shown with 6 significant digits.
create_dt <- function(x) {
  DT::datatable(x %>% mutate_if(is.numeric, signif, digits = 6),
                extensions = "Buttons",
                options = list(dom = "Blfrtip", scrollX = TRUE,
                               buttons = c("copy", "csv", "excel", "pdf", "print"),
                               lengthMenu = list(c(10, 25, 50, -1),
                                                 c(10, 25, 50, "All"))))
}
```
# Analytical Flow of Pipeline
Multi-environment trial (MET) analyses are critical for plant breeding programs. In a MET, a group of genotypes is evaluated in several environments (sites, years, or treatments) in order to assess the consistency (or stability) of their performance across these different environments. A lack of stability is known as genotype-by-environment (GxE) interaction and is the result of specific gene responses to the particular conditions of a given environment.
**Single- versus Two-stage Analysis**
For the analyses of MET, ideally all trials should be evaluated by fitting a unique complete model. This is known as **single-stage analysis**. In here, the original phenotypic measurements from all trials are modeled together and genetic, non-genetic and residuals variances for each trial are estimated. This process can be challenging and often there are computational complications, mostly originating by the large number of variance components to simultaneously estimate.
An alternative to the above approach is to perform a **two-stage analysis**. This is divided into two steps. In step 1, each trial is first analyzed individually. Here, all relevant factors, covariates and spatial components are considered, and once a final model is selected, adjusted means (BLUEs) are obtained for each of the genotypes in the trial together with their weights (further details later). Then, **in step 2**, all individual trial analyses are combined and used to fit a single linear mixed-model (LMM) that considers weights.
Both the single-stage and two-stages analyses have advantages and disadvantages. The most important advantage of the two-stages analysis is that it allows considering any relevant design or statistical elements for each trial. This includes different aspects such as spatial analyses, replication, etc. It is common for a single-stage analysis to have too much complexity making it difficult or near impossible to fit, and often some model simplifications are required (e.g., dropping some model terms). These simplifications can translate into important losses of accuracy.
Similarly, in the two-stages analyses, there is some loss of information, as the parameter estimation is done in two steps. However, this separation, allows for the first step to have as much complexity as desired, and often better information is obtained from this stage.
There is another important advantage of the two-stages analysis, and this is related to developing operational analytical pipelines. A single-stage analysis requires all the raw data available for the analysis at the moment of fitting the model. In contrast, the two-stages analyses the first step can be executed in advance and this information can be then retrieved to fit a second step with the desired trials and genotypes. This has also the advantage that the adjusted means (and weights) from the first step can be used for other studies such as genomic prediction (GP) and/or genome-wide association studies (GWAS).
**Steps of the Two-stage Analysis**
The focus of this pipeline is on the use of two-stage analyses. As indicated earlier, in the first step, the raw phenotypic data of each trial is analyzed. The idea is to use all available statistical tools to fit each of these trials, providing the largest possible accuracy for the genotype predictions. In the current pipeline, each site is evaluated considering genotype as a random effect (allowing for the estimation of heritability), and several model terms are considered to control for potential spatial heterogeneity. These include elements such as autoregressive errors of order 1 (AR1) for the rows and columns, polynomials or splines to model trends, and random row or column effects nested within replicate, also to model trends.
The procedure evaluates several of these models and, according to a given criterion (AIC, BIC, heritability), the best-fitting model is selected. If a low heritability is found (threshold specified by the user) then the trial is not considered in the second step of this analysis. Otherwise, the fitting proceeds. Here, all variance component estimates are fixed, and the model is refitted with the genotype effect considered as a fixed effect. From here, adjusted means (or BLUE means, or predictions) are obtained for each of the genotypes together with their weights. Weights are obtained as the diagonal of the inverse of the variance-covariance matrix of the adjusted mean estimates (or predictions).
It is important to note, that the current pipeline (workflow) also considers a careful exploration of each individual trial fit, where residual plots are checked, outliers are detected (and eliminated) in a semi-automatic way, and several other statistics are available (such as coefficient of variation, CV%) in order to assess the quality of each of these model fits. These fits should be checked carefully, as the predictions obtained will be stored and possibly used for many years and for different statistical evaluations.
For the second step, information from each of the individual fits is collected. This is an important aspect of the workflow, as different sets of trials can be selected for different objectives. For example, only trials from the current year might be evaluated, but also trials from, for example drought years, might be combined into a single two-stage MET analysis.
One important aspect to consider at this stage before proceeding is to check connectivity between trials by auditing the collected data. The MET analysis will not converge if no genotypes are common between some pairs of trials, or if only very few genotypes are common between trials. As a rule of thumb, we recommend having at least 5 genotypes in common. However, note that important levels of imbalance are often tolerated by LMMs (particularly in factor analytic structures), but some connectivity is still required.
After this audit, it is possible to fit the LMM with complex structures. Here, the current workflow considers the structures of single correlation and common variance (CORV), single correlation and heterogeneous variances (CORH), and several factor analytic (FA) structures.
The focus of the current pipeline is mainly on the FA structure, as it has several advantages, such as: 1) greatest model flexibility with approximations to different correlations and heterogeneous variances, 2) straightforward interpretation of some of the components (loadings, specific variances, etc.), 3) estimation of some correlations even under poor connectivity, and 4) fewer convergence issues than other structures, such as CORGH and US. In addition, this structure allows for a better understanding of the patterns of GxE and some selection of individuals for underlying factors that affect the response, and therefore that induce the observed pattern of GxE effects found in the data.
Further details and illustrations of these steps are presented in the example presented here.
# Getting Started
There are several R libraries that are used for statistical analyses in the present workflow. The following code specifies which libraries are required, and these will be installed if not available.
* ASReml-R: requires a commercial license to be installed (for further information please check: [this](https://www.vsni.co.uk/software/asreml-r))
* QBMS: requires access (login) to the working databases from the BMS system.
The specifications of the analyses are presented in the file config.in that assists on the data manipulations and model fit.
```{r load_libraries, message = FALSE, warning = FALSE}
# Packages required across both stages of the pipeline; any that are missing
# are installed on the fly so the report can knit on a fresh machine.
required_packages <- c("QBMS", "asreml", "ggplot2", "lattice", "gridExtra", "dplyr",
                       "plyr", "ggrepel", "tidyr", "ggfortify", "missMDA", "ggpubr",
                       "pheatmap", "kableExtra", "DT", "configr", "remotes", "plotly")
not_installed <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
# use an https CRAN mirror (plain http is deprecated by CRAN)
if (length(not_installed) > 0) install.packages(not_installed, repos = "https://cran.us.r-project.org")
#if (!require(QBMS)) remotes::install_github("icarda-git/QBMS")
# bsselectR is not on CRAN, so it is installed from GitHub when absent
if (!require(bsselectR)) remotes::install_github("walkerke/bsselectR")

# stage-1 required libraries
library(asreml)
library(ggplot2)
library(lattice)
library(gridExtra)
library(dplyr)
library(plyr)
library(QBMS)
library(plotly)

# stage-2 extra required libraries
library(ggrepel)
library(tidyr)      # required for reshaping long to wide table of data
library(ggfortify)  # required by the ./lib/GBiplot.R
library(missMDA)    # required by the ./lib/GBiplot.R
library(ggpubr)     # get_legend
library(pheatmap)   # pretty heatmap!

# r markdown required libraries
library(kableExtra)
library(DT)
library(configr)
library(bsselectR)

# stage-1 required includes
source("./lib/bms_import.R")
source("./lib/detect_outliers.R")
source("./lib/mcomp_asreml.R")
source("./lib/gral_stats.R")
source("./lib/gral_single_wgt.R")
source("./lib/spatial_selector.R")
source("./lib/fieldmap.R")

# stage-2 required includes
source("./lib/auditMET.R")
source("./lib/stageMET.R")
source("./lib/GBiplot.R")
source("./lib/fa_plots.R")

# get model definitions
load(file = "./lib/MODLIST.Rda")

# remove previous analysis outputs/plots
unlink("outputs/*", force = TRUE)
unlink("plots/*", force = TRUE)

# abort knitting early (with a visible warning) when the config file is missing
config_file <- "config.ini"
if (!file.exists(config_file)) {
  warning(paste(config_file, "does not exist!"))
  knitr::knit_exit()
}
config <- read.config(file = config_file)
cat(readLines(config_file), sep = '\n')
```
# Import and Loading Data
In this section we will be importing raw data directly from BMS. However, a standalone file with raw data can also be read and accessed.
```{r trial_info}
# Identifiers read from the [default] section of config.ini; they drive file
# naming and BMS lookups throughout the pipeline.
# the trial and trait names (e.g., IDYT39 and GY_Calc_kgha respectively)
study.name <- config$default$study.name
trait.name <- config$default$trait.name
```
## Import from BMS
If you are importing data from BMS (i.e., `source.bms` is TRUE), then `study.name` (e.g., "`r study.name`") should exist in the following `bms.program` in your `bms.crop`. Also, the `trait.name` (e.g., "`r trait.name`") should match your BMS ontology and has observed data. Please check these settings in the associated config file [here](./config.ini).
This analysis pipeline is using the [QBMS](https://icarda-git.github.io/QBMS/) R package to query the [Breeding Management System](https://bmspro.io/) database (using [BrAPI](https://brapi.org/) calls) and retrieves experiments data directly into R statistical analyzing environment.
>You will get a login pop-up window; use your BMS credentials.
```{r data_source}
# Is BMS the data source? Accept any capitalisation of "true"/"false" and
# treat a missing or invalid setting as FALSE. (The original strict
# comparison `== "TRUE"` silently failed on e.g. "true" and produced
# logical(0) when the key was absent from config.ini.)
source.bms <- isTRUE(as.logical(config$default$source.bms))

# define extra params required by the BMS API to retrieve the data
bms.url <- config$bms$url
bms.crop <- config$bms$crop
bms.program <- config$bms$program
```
## Import from Other Sources
If you are reading data directly, then two CSV files are required here in the same directory of this script named as the following:
- `r paste0(study.name, "_germplasm.csv")` for your list of germplasms, it should contain two columns `germplasmName` and `check` (0 for test entries and 1 for check/control entries).
- `r paste0(study.name, "_", trait.name, "_data.csv")` for your trials data. Mandatory columns are `trial`, `gen` (genotypes, they should match with the ones in the previous germplasm list), `block` (replication), `ibk` (incomplete blocks), `col` and `row` (field layout), and `resp` (trait response).
>If no block (replication) is considered in the designs, this should be filled with a 1 in the dataset. Also, if no ibk (incomplete blocks), column or row coordinates are available, these should be provided as NA in the dataset.
Get the name of required input files and check if they exist:
```{r data_files}
# Resolve the germplasm and trial-data file names, then verify both exist.
if (source.bms) {
  # data will be extracted directly from the BMS server and saved in intermediate CSV files
  files <- bms_import(study.name, trait.name, bms.crop, bms.program, bms.url)
  geno_file <- files$germplasm
  data_file <- files$data
} else {
  # set up the expected file names for locally supplied CSVs
  geno_file <- paste0(study.name, "_germplasm.csv")
  data_file <- paste0(study.name, "_", trait.name, "_data.csv")
}

# abort knitting early (with a visible warning) if either input file is missing
if (!file.exists(data_file)) {
  warning(paste(data_file, "does not exist!"))
  knitr::knit_exit()
}
if (!file.exists(geno_file)) {
  warning(paste(geno_file, "does not exist!"))
  knitr::knit_exit()
}
```
## Check for Mandatory Columns
The following code is an internal check that the required columns are provided in their respective data frames.
```{r data_loading}
# Load the trial observations and the germplasm list from CSV.
data <- read.csv(data_file)
germplasm.data <- read.csv(geno_file)

# When the data came from BMS, remember the study id for later uploads and
# drop the column so it does not interfere with downstream processing.
if ("studyId" %in% colnames(data)) {
  bms.studyId <- data$studyId[1]
  data$studyId <- NULL
}

# QC: stop knitting if any mandatory column is absent from either file.
required.data.cols <- c("trial", "gen", "block", "ibk", "col", "row", "resp")
oops <- setdiff(required.data.cols, colnames(data))
if (length(oops) > 0) {
  warning(paste("Can't find the following mandatory columns in your data file",
                data_file, "\n", paste(oops, collapse = ", ")))
  knitr::knit_exit()
}

required.geno.cols <- c("germplasmName", "check")
oops <- setdiff(required.geno.cols, colnames(germplasm.data))
if (length(oops) > 0) {
  warning(paste("Can't find the following mandatory columns in your germplasm file",
                geno_file, "\n", paste(oops, collapse = ", ")))
  knitr::knit_exit()
}
```
## Data Pre-processing
In this section, some conversions of columns within a data frame are performed to be ready for the downstream analyses (e.g., convert to factor, sorting, and unique identifier).
```{r data_preprocessing}
# Make sure that design and experimental treatments are factors, and the
# response is numeric (via character to survive factor-encoded inputs).
factor.cols <- c("gen", "trial", "block", "ibk", "col", "row")
for (fc in factor.cols) data[[fc]] <- factor(data[[fc]])
data$resp <- as.numeric(as.character(data$resp))

# Data MUST be sorted into the order specified in the rcov/residuals term
# (trial, then row, then col) for the spatial models fitted later.
data <- data[with(data, order(trial, row, col)), ]

# Create a unique record identifier. seq_len(nrow(...)) is safe on zero-row
# data, unlike the 1:dim(data)[1] idiom which would yield c(1, 0).
data$id <- seq_len(nrow(data))
```
## Extract Check Entries
Extract the names of check entries and tag them in the data file:
>It is important to verify that these entries are the ones defined as checks (or controls).
```{r check_entries}
# The germplasm list flags each entry as test (0) or check (1); collect the
# names of the check/control entries.
check.name <- germplasm.data$germplasmName[germplasm.data$check == 1]

# Tag check entries in the data set, matching names case-insensitively and
# ignoring embedded spaces to handle minor data-entry inconsistencies.
canon_name <- function(x) gsub(" ", "", tolower(x))
data$check <- ifelse(canon_name(data$gen) %in% canon_name(check.name), 1, 0)
```
>Check entries are: `r check.name`
## Define Experimental Design
Define the type of experiment design for each trial and assess the availability of spatial information. The available designs in this pipeline are:
* ALPHA: alpha-lattice design.
* RCBD: randomized complete block design.
* AUGMENTED: RCBD augmented design in a RCBD.
* PREP: partially-replicated design.
```{r trial_type}
# Scan each trial: infer the experimental design from the presence of the
# replicate (block) and incomplete-block (ibk) factors, and record any
# row/column coordinate integrity issues found.

# create a new data.frame to report all issues related to the row/col data if there is any
spatial.issues <- data.frame("trial" = character(), "issue" = character(),
                             "row" = character(), "col" = character())

# create a new data.frame list experiment design of each trial (ALPHA, RCBD, AUGMENTED, PREP)
trial.type <- data.frame("trial" = character(), stringsAsFactors = FALSE)

#***************************************
#* Design      has.block   has.ibk
#*
#* CRD         FALSE       FALSE
#* AUGMENTED   FALSE       TRUE
#* RCBD        TRUE        FALSE
#* ALPHA       TRUE        TRUE
#***************************************

# can guess experiment design depends on presence/absence of block and ibk (i.e., rep and block) design factors
for(i in 1:nlevels(data$trial)){
  site.data <- subset(data, trial == levels(data$trial)[i])
  trial.type[i, "trial"] <- levels(data$trial)[i]

  # a factor counts as "present" only when fully recorded AND having > 1 level
  has.block <- !anyNA(site.data$block) && length(unique(site.data$block)) > 1
  has.ibk <- !anyNA(site.data$ibk) && length(unique(site.data$ibk)) > 1
  trial.type[i, "design"] <- ifelse(has.block,
                                    ifelse(has.ibk, "ALPHA", "RCBD"),
                                    ifelse(has.ibk, "AUGMENTED", "CRD"))

  # spatial information is usable only when BOTH coordinates are complete
  has.spatial <- !(anyNA(site.data$row) | anyNA(site.data$col))
  if (has.spatial) {
    # assure the row/column data contiguity in the full field dimensions
    contiguous.data <- expand.grid(unique(site.data$row), unique(site.data$col))
    colnames(contiguous.data) <- c("row", "col")
    # NOTE(review): setdiff() on data frames here relies on the dplyr method
    # being attached (base setdiff would not row-wise compare) — confirm
    missing.coords <- setdiff(contiguous.data, site.data[,c("row", "col")])
    if (!empty(missing.coords)) {  # plyr::empty — TRUE when zero rows
      missing.coords$issue <- "missing row x col"
      missing.coords$trial <- trial.type[i, "trial"]
      spatial.issues <- rbind(spatial.issues, missing.coords)
    }
    # check if there is any duplicate row/col coordinates
    duplicated.coords <- site.data[duplicated(site.data[,c("row", "col")]),]
    if (!empty(duplicated.coords)) {
      duplicated.coords <- duplicated.coords[,c("row", "col")]
      duplicated.coords$issue <- "duplicated row x col"
      duplicated.coords$trial <- trial.type[i, "trial"]
      spatial.issues <- rbind(spatial.issues, duplicated.coords)
    }
    # record the field dimensions alongside the spatial flag
    trial.type[i, "has.spatial"] <- paste0("Yes (", length(unique(site.data$row)), "x",
                                           length(unique(site.data$col)), " row/col)")
  } else {
    trial.type[i, "has.spatial"] <- "NO"
  }
}

# show the per-trial design summary as an interactive table
create_dt(trial.type)

# surface any coordinate problems found above
if (!empty(spatial.issues)) {
  warning("has some issues in spatial data integrity!")
  create_dt(spatial.issues)
}
```
## Field Maps
The following code provides a field map to assess the spatial distribution of the response variable for each trial with available spatial information. This is recommended in order to identify potential outliers, trait patterns in the field, potential post-blocking, etc.
```{r field_maps, warning = FALSE}
# draw fieldmaps!
# One field map per trial with complete spatial coordinates: a layout of the
# response over row/col, annotated with genotype names.
field.plots <- list()
plot.names <- list()

for(i in 1:nlevels(data$trial)){
  site.data <- subset(data, trial == levels(data$trial)[i])
  # skip trials that lack complete row/column coordinates
  if (anyNA(site.data$row) | anyNA(site.data$col)) next
  field.plots[[i]] <- fieldmap(x=site.data, column="resp", textColumn="gen", asFactor=F,
                               rows="row", ranges="col", borderColumns="ibk",
                               main=paste(levels(data$trial)[i], trait.name))
  plot.names[[i]] <- levels(data$trial)[i]
  # save a standalone PNG per trial (browsed via the bsselect dropdown below)
  png(paste0("plots/fieldmap_", plot.names[[i]], "_", trait.name, ".png"))
  print(field.plots[[i]])
  dev.off()
}

if (length(plot.names) != 0) {
  # drop the NULL slots left by the trials skipped above
  field.plots[sapply(field.plots, is.null)] <- NULL
  plot.names[sapply(plot.names, is.null)] <- NULL
  # bundle all maps into one multi-page PDF, sized from the field dimensions
  pages <- marrangeGrob(grobs = field.plots, nrow=1, ncol=1)
  page.width <- 2 * max(as.numeric(data$col))
  page.height <- 5 * max(as.numeric(data$row))
  ggsave(paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Fieldmap.pdf"),
         plot = pages, width = page.width, height = page.height, dpi = 600, units = "cm", limitsize = FALSE)
  # interactive dropdown in the HTML report to browse the per-trial PNGs
  select.plot <- paste0("plots/fieldmap_", plot.names, "_", trait.name, ".png")
  names(select.plot) <- plot.names
  bsselect(select.plot, type = "iframe", live_search = TRUE, show_tick = TRUE, elementId = "fieldmaps")$prepend[[1]]$html
} else {
  message("There is no spatial data!")
}
```
```{r results = 'asis', echo = FALSE}
if (length(plot.names) == 0) { cat("<!--") }
```
>_Check your field maps [here](`r paste0("./outputs/",study.name, "_", trait.name, "_Stage-1_Fieldmap.pdf")`)_
```{r results = 'asis', echo = FALSE}
if (length(plot.names) == 0) { cat("-->") }
```
# Single-Environment Analysis
This is the first step of the two-stages analyses where each trial (or environment) is evaluated individually to obtain the ‘best’ possible fit for the trial under study.
## Basic Distribution Plots
These plots highlight potential differences in the data that might affect the analyses and should alert the users. They assist in identifying trials/individuals with large variations or large differences between replications/blocks.
```{r distribution_plots, fig.width = 6, fig.height = 6, dpi = 300, warning = FALSE}
# widen the bottom margin so long genotype labels fit under base-graphics plots
par(mar = c(10, 4, 2, 2) + 0.1)

# lattice panel per trial: response by genotype, grouped (coloured) by block
xyplot(resp ~ gen | trial, groups = block, data = data,
       scales = list(draw = FALSE), strip = strip.custom(par.strip.text = list(cex = 0.8)))

# interactive boxplots: response by trial, then response by genotype
plot_ly(data, y = ~resp, x = ~trial, type = "box", width = 800, height = 480)
plot_ly(data, y = ~resp, x = ~gen, type = "box", width = 800, height = 480)
```
```{r stage1_process, cache = FALSE, include = FALSE}
# Stage-1 set-up: outlier-detection thresholds, output files, and the empty
# accumulator tables that the per-trial loop below fills in.

# define the criteria to exclude outliers (threshold grid scanned from start
# to stop in steps — values come from the [outliers] section of config.ini)
outlier.threshold.start <- as.numeric(config$outliers$threshold.start)
outlier.threshold.stop <- as.numeric(config$outliers$threshold.stop)
outlier.threshold.step <- as.numeric(config$outliers$threshold.step)

# file name to save outliers
outliers.file <- paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Outliers.csv")
cleaned.file <- paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Cleaned_Data.csv")

# start from a clean slate: remove outputs left over from a previous run
if (file.exists(outliers.file)) file.remove(outliers.file)
if (file.exists(cleaned.file)) file.remove(cleaned.file)

# per-trial diagnostic plots collected while looping over trials
residual.plots <- list()
variogram.plots <- list()
residual.layout <- list()

# Summary Table for sites — one row per trial; mixes pipeline statistics
# (heritability, selected model, variance components) with the descriptive
# statistics expected by the BMSSummary.csv / BreedingView upload format.
summS <- data.frame("Environment" = character(),
                    "Type" = character(),
                    "nRep" = numeric(),
                    "nTotalEntries" = numeric(),
                    "nCheckEntries" = numeric(),
                    "nTestEntries" = numeric(),
                    "nonNA_nVar" = numeric(),
                    "nonNA_obs" = numeric(),
                    "nonNA_AvRep" = numeric(),
                    "grandMean" = numeric(),
                    "checksMean" = numeric(),
                    "newLinesMean" = numeric(),
                    "sel_model" = numeric(),
                    "heritVC" = numeric(),
                    "heritPEV" = numeric(),
                    "genotypicVar" = numeric(),
                    "residualVar" = numeric(),
                    "CV" = numeric(),
                    "Trait" = character(),
                    "NumMissing" = numeric(),
                    "LSD" = numeric(),
                    "Variance" = numeric(),
                    "SD" = numeric(),
                    "Min" = numeric(),
                    "Max" = numeric(),
                    "Range" = numeric(),
                    "Median" = numeric(),
                    "LowerQuartile" = numeric(),
                    "UpperQuartile" = numeric(),
                    "MeanRep" = numeric(),
                    "MinRep" = numeric(),
                    "MaxRep" = numeric(),
                    "WaldStatistic" = numeric(),
                    "WaldDF" = numeric(),
                    "Pvalue" = numeric(),
                    stringsAsFactors = FALSE)

# Prediction Table for sites — adjusted means (BLUEs) with weights, one row
# per genotype x environment
predS <- data.frame("gen" = character(),
                    "predicted.value" = numeric(),
                    "std.error" = numeric(),
                    "status" = character(),
                    "weight" = numeric(),
                    "Environment" = character(),
                    stringsAsFactors = FALSE)

# Prediction Table for sites (BLUP) — same layout as predS but for the
# random-genotype predictions
predBLUP <- data.frame("gen" = character(),
                       "predicted.value" = numeric(),
                       "std.error" = numeric(),
                       "status" = character(),
                       "weight" = numeric(),
                       "Environment" = character(),
                       stringsAsFactors = FALSE)
for(i in 1:nlevels(data$trial)){
# Restrict the single site data set
site.data <- subset(data, trial == levels(data$trial)[i])
# Determining type of (spatial) analysis to do
has.block <- !anyNA(site.data$block)
has.ibk <- !anyNA(site.data$ibk)
has.spatial <- !(anyNA(site.data$row) | anyNA(site.data$col))
# Summary of data for this trial
summS[i, "Environment"] <- levels(data$trial)[i] # Environment/location name
summS[i, "Type"] <- trial.type[i, "design"] # Experiment design
summS[i, "nRep"] <- length(unique(site.data$block)) # Number of replications
summS[i, "nTotalEntries"] <- length(unique(site.data$gen)) # Total Number of varieties (entries)
summS[i, "nCheckEntries"] <- length(unique(site.data$gen[site.data$check == 1])) # Total Number of Check entries
summS[i, "nTestEntries"] <- length(unique(site.data$gen[site.data$check == 0])) # Total Number of Test entries
nonNA.data <- site.data[!is.na(site.data$resp),]
summS[i, "nonNA_nVar"] <- length(unique(nonNA.data$gen)) # Total Number of non-missing varieties
summS[i, "nonNA_obs"] <- nrow(nonNA.data) # Total Number of non-missing observations
summS[i, "nonNA_AvRep"] <- summS[i,"nonNA_obs"]/summS[i,"nonNA_nVar"] # Average Number of replications for non-missing varieties
summS[i, "grandMean"] <- mean(nonNA.data$resp) # Grand mean mu
summS[i, "checksMean"] <- mean(nonNA.data$resp[nonNA.data$check == 1]) # Checks mean mu(c)
summS[i, "newLinesMean"] <- mean(nonNA.data$resp[nonNA.data$check == 0]) # New lines mean mu(L)
# Extra for BMSSummary.csv (BreedingView output for uploading to BMS)
summS[i, "Trait"] <- trait.name # Response trait name
summS[i, "NumMissing"] <- sum(is.na(site.data$resp)) # Number of missing values
summS[i, "Variance"] <- var(nonNA.data$resp) # Response Variance
summS[i, "SD"] <- sd(nonNA.data$resp) # Response Standard Deviation
summS[i, "Min"] <- min(nonNA.data$resp) # Minimum value of response
summS[i, "Max"] <- max(nonNA.data$resp) # Maximum value of response
summS[i, "Range"] <- summS[i, "Max"] - summS[i, "Min"] # Range of response values
summS[i, "Median"] <- median(nonNA.data$resp) # Median response value
summS[i, "LowerQuartile"] <- quantile(nonNA.data$resp)[[2]] # Lower Quartile value (25%)
summS[i, "UpperQuartile"] <- quantile(nonNA.data$resp)[[4]] # Upper Quartile value (75%)
if(has.block){
# Per-replicate (block) means of the response
rep.mean <- aggregate(nonNA.data$resp, list(nonNA.data$block), FUN = mean)
summS[i, "MeanRep"] <- mean(rep.mean$x)
summS[i, "MinRep"] <- min(rep.mean$x)
summS[i, "MaxRep"] <- max(rep.mean$x)
# skip if replicated trial has only one valid replication data
if(trial.type[i,"design"] %in% c("ALPHA", "RCBD") & length(unique(site.data[!is.na(site.data$resp),"block"])) == 1) next
# skip if replication plots have the same response values (copy-paste) to avoid singularity case
tmp <- site.data[rownames(unique(site.data[c("gen", "block")])), c("gen", "block", "resp")]
# one row per genotype, one response column per block
tmp <- reshape(tmp, idvar = "gen", timevar = "block", direction = "wide")
# zero for a genotype exactly when all its per-block responses are identical
# NOTE(review): NA responses propagate through sum() here, making the if()
# below evaluate on NA — confirm no NAs can reach this point
tmp <- apply(tmp[,-1], 1, function(x) sum(x) - x[1]*length(x))
if(all(tmp == 0)) next
}
# Optional incomplete-block (ibk) term: pass the column name or NULL
if(has.ibk){
ibk.field <- "ibk"
} else {
ibk.field <- NULL
}
# Genotypes are random by default; fixed for Augmented/unreplicated designs
type.gen <- "random"
if(trial.type[i,"design"] == "AUGMENTED"){
type.gen <- "fixed"
message(paste(levels(data$trial)[i],
"- Genetic effect treated as fixed as we are evaluating an Augmented/Unreplicated design"))
strict.level <- 2
# Warn when too few genotypes/checks are replicated to estimate variance components reliably
if(sum(table(site.data$gen) > 1) < 3){
message(paste(levels(data$trial)[i],
"- You have less than 3 genotypes/control replicated, your variance components
might be unstable (e.g., getting weird heritability estimation)!"))
}
}else{
strict.level <- 1
}
if (has.spatial) {
# Performing Spatial Analysis
# assure the row/column data contiguity in the full field dimensions
contiguous.data <- expand.grid(unique(site.data$row), unique(site.data$col))
colnames(contiguous.data) <- c("row", "col")
site.data <- merge(contiguous.data, site.data, by = c("row","col"), all.x = TRUE)
site.data <- site.data[with(site.data, order(row, col)),]
# include/exclude checks discussion (same population vs. higher variance e.g. susceptible/resistance)
# Fit the full battery of candidate spatial models for this trial
test.sel <- spatial.selector(data=site.data, gen="gen", check=NULL, row="row", col="col",
resp="resp", type.gen=type.gen, block="block", ibk=ibk.field,
type.block="fixed", strict=strict.level)
# Best spatial model selected according to minimum A.opt (Augmented) or max heritPEV (all other)
if(trial.type[i,"design"] == "AUGMENTED"){
sel <- test.sel$parms[which.min(test.sel$parms$A.opt),"MODEL"]
}else{
sel <- test.sel$parms[which.max(test.sel$parms$heritPEV),"MODEL"]
}
summS[i, "sel_model"] <- sel # selected spatial model
# Refit only the selected model to obtain its full output object
output.ID <- spatial.selector(data=site.data, gen="gen", check=NULL, row="row", col="col",
resp="resp", type.gen=type.gen, block="block", ibk=ibk.field,
type.block="fixed", model=sel, strict=strict.level)
}else{
# No-Spatial but different design scenarios
summS[i, "sel_model"] <- 0 # no spatial model
# Performing No-Spatial Analysis: y = mu + block + block:ibk + gen + e (not dealing checks)
output.ID <- gral.single(data=site.data, gen="gen", resp="resp",
block="block", ibk=ibk.field, add.block=has.block, add.ibk=has.ibk,
type.gen=type.gen, type.block="fixed", type.residual="indep")
}
# if there are singularities in the average information matrix, skip this trial
if (is.na(output.ID$call)) {
summS[i, "sel_model"] <- NA
next
}
# detect outliers using the fitted model and the pre-specified residual thresholds
outliers <- detect_outliers(site.data, output.ID$mod,
outlier.threshold.start, outlier.threshold.stop, outlier.threshold.step)
# remove outliers (keep only records with a valid plot id)
clean.data <- outliers$data[!is.na(outliers$data$id),]
# NOTE(review): `data$id == clean.data$id` compares the two id vectors
# elementwise with recycling; if membership matching is intended, this should
# use match() or %in% — confirm that ids are guaranteed to be aligned here
data$resp[data$id == clean.data$id] <- clean.data$resp
data$scres[data$id == clean.data$id] <- clean.data$scres
if(length(outliers$id) > 0){
# write each flagged observation's standardized residual back into the master data
for(j in 1:length(outliers$id)) data[data$id == outliers$id[j],]$scres <- outliers$scres[j]
# report the outliers to review
write.table(outliers$verify, file = outliers.file, sep = ",", row.names = FALSE, col.names = FALSE, append = TRUE)
}
# append this trial's cleaned records (column headers are fixed after the loop)
write.table(clean.data, file = cleaned.file, sep = ",", row.names = FALSE, col.names = FALSE, append = TRUE)
# Running final model with removed outliers
if (has.spatial) {
# setup is the selected row of mod.list; positions index the model-spec
# columns (row/col effects, splines, residual type, nugget) — see mod.list
setup <- mod.list[sel,]
# REPr: estimated variance components (fix.vc=FALSE) -> heritability, BLUPs
output.REPr <- gral.single(data=outliers$data, gen="gen", row="row", col="col",
block="block", ibk=ibk.field, resp="resp",
add.block=TRUE, add.row=setup[2], add.col=setup[3],
add.spl.row=as.logical(setup[4]), add.spl.col=as.logical(setup[5]), add.ibk=TRUE,
type.gen=type.gen, type.block="fixed", type.residual=setup[6],
add.nugget=as.logical(setup[7]), fix.vc=FALSE)
# Running Final model to generate predictions for 2-stage to STORE
output.REPf <- gral.single(data=outliers$data, gen="gen", row="row", col="col",
block="block", ibk=ibk.field, resp="resp",
add.block=TRUE, add.row=setup[2], add.col=setup[3],
add.spl.row=as.logical(setup[4]), add.spl.col=as.logical(setup[5]), add.ibk=TRUE,
type.gen=type.gen, type.block="fixed", type.residual=setup[6],
add.nugget=as.logical(setup[7]), fix.vc=TRUE)
}else{
# Non-spatial equivalents of the two fits above
output.REPr <- gral.single(data=outliers$data, gen="gen", resp="resp",
block="block", ibk=ibk.field, add.block=has.block, add.ibk=has.ibk,
type.gen=type.gen, type.block="fixed", type.residual="indep", fix.vc=FALSE)
# Running Final model to generate predictions for 2-stage to STORE
output.REPf <- gral.single(data=outliers$data, gen="gen", resp="resp",
block="block", ibk=ibk.field, add.block=has.block, add.ibk=has.ibk,
type.gen=type.gen, type.block="fixed", type.residual="indep", fix.vc=TRUE)
}
# --- Diagnostic plots for the refitted model ---
plots <- plot(output.REPr$mod) # Residual plots
residual.plots[[i]] <- grid.arrange(plots$histogram, plots$fitted, plots$qq, plots$rowlabels, nrow=2, top=summS$Environment[i])
variogram.plots[[i]] <- plot(varioGram(output.REPr$mod), main=summS$Environment[i]) # Variogram fitted model
# Save per-environment PNGs consumed by the interactive dropdowns later in the report
png(paste0("plots/residual_", summS$Environment[i], "_", trait.name, ".png"))
plot(residual.plots[[i]])
dev.off()
png(paste0("plots/variogram_", summS$Environment[i], "_", trait.name, ".png"))
print(variogram.plots[[i]])
dev.off()
if (has.spatial) {
# Field-layout heatmap of the standardized conditional residuals (scres)
residual.layout[[i]] <- ggplot(outliers$data, aes(row, col, fill = scres)) + geom_tile() +
scale_fill_gradient2(low = "#DD2525", mid = "#FFFCBF", high = "#258025") +
ggtitle(paste("Residuals Layout -", summS$Environment[i])) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "transparent",colour = NA),
plot.background = element_rect(fill = "transparent",colour = NA)
)
png(paste0("plots/res_layout_", summS$Environment[i], "_", trait.name, ".png"))
print(residual.layout[[i]])
dev.off()
}
# output.REPr$mod$call
# output.REPr$gt # Goodness-of-fit statistics
# output.REPr$aov # Wald-test (mixed model ANOVA-like table)
# summary(output.REPr$mod)$varcomp # Variance Components
# resid(output.REPr$mod) # Fit LMM (ideally spatial)
# Goodness-of-fit statistics from the estimated-VC fit (output.REPr)
summS[i, "heritVC"] <- output.REPr$gt$herit.VC
summS[i, "heritPEV"] <- output.REPr$gt$herit.PEV
summS[i, "genotypicVar"] <- output.REPr$gt$v.gen
summS[i, "residualVar"] <- output.REPr$gt$v.res
summS[i, "CV"] <- output.REPr$gt$cv.VC
summS[i, "LSD"] <- output.REPr$gt$lsd
# Extra for BMSSummary.csv (BreedingView output for uploading to BMS)
summS[i, "WaldStatistic"] <- output.REPf$aov["gen","F.inc"]
summS[i, "WaldDF"] <- output.REPf$aov["gen","Df"]
summS[i, "Pvalue"] <- output.REPf$aov["gen","Pr"]
# Store Predictions: output.REPf feeds predS (Stage-1 predictions file),
# output.REPr feeds predBLUP (Stage-1 BLUPs file)
# NOTE(review): rbind() inside the loop grows these data frames each
# iteration; fine for a modest number of trials, consider a list otherwise
output.REPf$predictions$Environment <- summS$Environment[i]
predS <- rbind(predS, output.REPf$predictions)
output.REPr$predictions$Environment <- summS$Environment[i]
predBLUP <- rbind(predBLUP, output.REPr$predictions)
}
# --- Page and export the collected diagnostics, one plot per PDF page ---
variogram.plots[sapply(variogram.plots, is.null)] <- NULL # drop slots for skipped trials
pages <- marrangeGrob(grobs = variogram.plots, nrow=1, ncol=1)
ggsave(paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Variogram_Plots.pdf"),
plot = pages, width = 20, height = 20, dpi = 600, units = "cm")
residual.plots[sapply(residual.plots, is.null)] <- NULL
pages <- marrangeGrob(grobs = residual.plots, nrow=1, ncol=1)
ggsave(paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Residual_Plots.pdf"),
plot = pages, width = 20, height = 20, dpi = 600, units = "cm")
# Residual layouts exist only when at least one spatial trial was analyzed
if (length(residual.layout) != 0) {
residual.layout[sapply(residual.layout, is.null)] <- NULL
pages <- marrangeGrob(grobs = residual.layout, nrow=1, ncol=1)
ggsave(paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Residual_Layouts.pdf"),
plot = pages, width = 20, height = 20, dpi = 600, units = "cm")
}
# Persist Stage-1 tables for the downstream two-stage (MET) analyses
write.csv(summS, paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Summary_Statistics.csv"))
write.csv(predS, paste0("outputs/", study.name, "_", trait.name, "_Stage-1_Predictions.csv"))
write.csv(predBLUP, paste0("outputs/", study.name, "_", trait.name, "_Stage-1_BLUPs.csv"))
# Fix headers in the CSV files that were built by appending rows (without
# column names) inside the per-trial loop: read each file back, attach the
# proper column names, and rewrite it.
if (file.exists(outliers.file)) {
  temp <- read.csv(outliers.file, header = FALSE)
  # outliers$verify carries one extra flag column beyond the data columns
  colnames(temp) <- c(colnames(outliers$data), "outlier")
  write.csv(temp, outliers.file, row.names = FALSE)
}
# Guard like the outliers file above: cleaned.file only exists if at least one
# trial reached the write.table() call in the loop (all trials can be skipped
# via `next`), so an unconditional read.csv() could abort the report here.
if (file.exists(cleaned.file)) {
  temp <- read.csv(cleaned.file, header = FALSE)
  colnames(temp) <- colnames(clean.data)
  write.csv(temp, cleaned.file, row.names = FALSE)
}
```
## Summary Table of Data
- `Environment`: Environment/location name.
- `Type`: Experiment design.
- `nRep`: Number of replications.
- `nTotalEntries`: Total Number of entries (genotypes).
- `nCheckEntries`: Total Number of check entries.
- `nTestEntries`: Total Number of test entries.
- `nonNA_nVar`: Total Number of non-missing entries.
- `nonNA_obs`: Total Number of non-missing observations.
- `nonNA_AvRep`: Average Number of replications for non-missing entries.
- `grandMean`: Grand mean mu.
- `checksMean`: Checks mean mu(c).
- `newLinesMean`: New lines mean mu(L).
- `Trait`: Response trait name.
- `NumMissing`: Number of missing values.
- `Min`: Minimum value of response.
- `Max`: Maximum value of response.
- `Range`: Range of response values.
- `Median`: Median response value.
- `LowerQuartile`: Lower Quartile value (25%).
- `UpperQuartile`: Upper Quartile value (75%).
- `MeanRep`: Average number of replications.
- `MinRep`: Minimum number of replications.
- `MaxRep`: Maximum number of replications.
```{r data_summary}
# Interactive table of the per-trial descriptive statistics
# (column indices pick the summary-stat columns of summS; update if its layout changes)
create_dt(summS[,c(1:12, 19:20, 24:32)])
```
## Detecting Outliers
These are the residuals and variograms from each trial available to identify potential outliers (scres corresponds to the standardized conditional residuals). The interactive graphs are presented below for exploration. In addition, some outliers are reported in the database based on the pre-specified thresholds (see table below). Note that some observations are flagged as outliers (outlier = 1), and the counterpart observations (outlier = 0) for the same germplasms in other replications are reported alongside them. Close exploration of this database should ensure that the corresponding observations are eliminated.
```{r residual_plots, warning = FALSE}
# Only environments where a model was selected and fitted successfully
select.environment <- summS[!is.na(summS$sel_model) & !is.na(summS$residualVar), "Environment"]
select.residual <- paste0("plots/residual_", select.environment, "_", trait.name, ".png")
names(select.residual) <- select.environment
# Searchable dropdown rendering each environment's residual-plot PNG in an iframe
bsselect(select.residual,
type = "iframe",
live_search = TRUE,
show_tick = TRUE,
elementId = 'residuals')$prepend[[1]]$html
```
```{r variogram_plots, warning = FALSE}
# Only environments where a model was selected and fitted successfully
select.environment <- summS[!is.na(summS$sel_model) & !is.na(summS$residualVar), "Environment"]
# Map each environment to its saved variogram image, named for the dropdown labels
select.variogram <- setNames(
  paste0("plots/variogram_", select.environment, "_", trait.name, ".png"),
  select.environment
)
# Searchable dropdown rendering each environment's variogram PNG in an iframe
widget <- bsselect(select.variogram,
                   type = "iframe",
                   live_search = TRUE,
                   show_tick = TRUE,
                   elementId = "variograms")
widget$prepend[[1]]$html
```
```{r residual_layouts, warning = FALSE}
# Spatial trials only: residual field-layout heatmaps
# NOTE(review): rows of trial.type are assumed to align one-to-one with rows
# of summS — confirm both are built in the same trial order
select.environment <- summS[!is.na(summS$sel_model) & trial.type$has.spatial != "NO" & !is.na(summS$residualVar), "Environment"]
select.residual <- paste0("plots/res_layout_", select.environment, "_", trait.name, ".png")
names(select.residual) <- select.environment
# select.environment is reused by later chunks to toggle layout links and tables
if (length(select.environment) != 0) {
bsselect(select.residual,
type = "iframe",
live_search = TRUE,
show_tick = TRUE,
elementId = 'res_layouts')$prepend[[1]]$html
}
```
```{r detecting_outliers}
# Interactive table of the flagged observations
# NOTE(review): `temp` was last assigned from cleaned.file during the
# header-fix step, so this displays the cleaned data rather than the outliers
# report — confirm this is the intended table
create_dt(temp)
```
- Description of graphical/numerical step 1 output:
- `residual.plots`: Residual plots.
- `variogram.plots`: Variogram fitted model.
>_Check your residual plots [here](`r paste0("./outputs/", study.name, "_", trait.name, "_Stage-1_Residual_Plots.pdf")`)_
>_Check your variogram fitted model [here](`r paste0("./outputs/", study.name, "_", trait.name, "_Stage-1_Variogram_Plots.pdf")`)_
```{r results = 'asis', echo = FALSE}
# Emit an opening HTML comment to hide the residual-layouts link when no
# spatial environments are available (select.environment set earlier)
if (length(select.environment) == 0) { cat("<!--") }
```
>_Check your residual layouts [here](`r paste0("./outputs/", study.name, "_", trait.name, "_Stage-1_Residual_Layouts.pdf")`)_
```{r results = 'asis', echo = FALSE}
# Emit the matching closing HTML comment when the layouts link is hidden
if (length(select.environment) == 0) { cat("-->") }
```
## Summary Table of Final Models
The following table presents a summary of the final models selected for each of the individual trials. The final models were fitted after the elimination of outliers (outlier = 1). The procedure in this case fits a total of 26 different spatial models (only 8 in case of Augmented design trials) and selects the best model according to the specified criteria.
It is critical to explore this output database, as from here some of the trials with poor performance (e.g., heritVC < 0.10) should be eliminated: they might have establishment or phenotyping issues and are therefore unlikely to contribute to the downstream MET analyses. This summary table has several relevant statistics that can be used in combination to assess their quality.
- `Environment`: Environment/location name.
- `sel_model`: Selected model (see next section).
- `heritVC`: The heritability calculated based on variance components.
- `heritPEV`: The heritability calculated based on the predictor error variance (only for genotypes considered as *random*).
- `genotypicVar`: Genotypic Variance.
- `residualVar`: Residual Variance.
- `CV`: Coefficient of Variation.
- `LSD`: Least Significant Difference.
- `Variance`: Total Variance of the response variable.
- `SD`: Response Standard Deviation.
- `WaldStatistic`: Wald Chi-Squared Test statistic (mixed model ANOVA-like table).
- `WaldDF`: Wald-test Degree of Freedom.
- `Pvalue`: p-value for the test line.
```{r final_models_summary}
# Interactive table of the per-trial model-fit statistics
# (column indices pick the model-fit columns of summS; update if its layout changes)
create_dt(summS[,c(1, 13:18, 21:23, 33:35)])
```
## Models Selection
The table below presents the final models selected according to the criteria specified earlier. The model specifications presented here are just informative, but they provide an idea of the relevance of some model terms in the particular dataset selected.
- Spatial Model selector
- `Model`: A numeric value with the number of the selected Model from the list of evaluated models.
- `RepRow`: If TRUE the factor for random effects of row (or row within replicated if replicate is available) will be added to the model.
- `RepCol`: If TRUE the factor for random effects of column (or column within replicated if replicate is available) will be added to the model.
- `splineRow`: If TRUE the factor for random effects of spline for row together with a fixed effect covariate for row will be added to the model.
- `splineCol`: If TRUE the factor for random effects of spline for column together with a fixed effect covariate for column will be added to the model.
- `Resid`: A string with the specification of type of residual terms to fit. Options are: ‘indep’ and ‘ar1’ for independent errors, and autoregressive of order 1 for both rows and columns.
- `Nugget`: If TRUE a nugget random effect (only for autoregressive errors) will be added to the model.
- `List`: A numeric value indicating the list category (or priority) to be used to subset the list of evaluated models.
- Non-spatial model selector
- Fitting Criteria
```{r model_selection}
# Join each environment's selected model id onto the model-specification list;
# only meaningful when at least one spatial trial exists (select.environment
# was set in the residual-layouts chunk)
if (length(select.environment) != 0) {
create_dt(merge(summS[,c("Environment", "sel_model")],
mod.list,
by.x = "sel_model",
by.y = "Model",
sort = FALSE))
} else {
warning("There is no spatial data!")
}
```
## Obtaining Predictions
This table is the main output table generated from the first-step analyses: it contains the adjusted means (BLUEs) obtained from each single-site analysis. These predictions need to be stored for further use, as the single-site analyses will not need to be redone in the future.
- `gen`: Genotype name.
- `predicted.value`: Predicted response value (BLUEs).
- `std.error`: Standard Error of predicted value.
- `status`: A comment to indicate if this predicted value is estimable.
- `weight`: The inverse of standard error.
- `Environment`: Environment name.
```{r obtaining_predictions}
# Interactive table of the per-environment adjusted means stored for stage 2
create_dt(predS)
```
## BMS Output
```{r bms_output}
folder.name <- paste0("BMS_", format(Sys.Date(), "%Y-%m-%d"), "T", format(Sys.time(), "%H-%M-%S"))
if ("TRIAL_INSTANCE" %in% colnames(data)) {
dir.create(paste0("./outputs/", folder.name))
col.mean <- paste0(trait.name, "_Means")
col.se <- paste0(trait.name, "_UnitErrors")
BMSOutput <- data.frame(matrix(ncol = 6, nrow = dim(predS)[1]))
colnames(BMSOutput) <- c("TRIAL_INSTANCE", "LOCATION_NAME", "ENTRY_NO", "GID", col.mean, col.se)
lookup.TRIAL_INSTANCE <- unique(data[,c("trial", "TRIAL_INSTANCE")])
bms.predS <- merge(predS, lookup.TRIAL_INSTANCE, by.x = "Environment", by.y = "trial", all.x = TRUE)
lookup.germplasm <- germplasm.data[,c("germplasmName", "germplasmDbId", "entryNumber")]
lookup.germplasm$germplasmName <- gsub(" ","",tolower(lookup.germplasm$germplasmName))
lookup.germplasm <- lookup.germplasm[!duplicated(lookup.germplasm$germplasmName),]
bms.predS$gen <- gsub(" ","",tolower(bms.predS$gen))
bms.predS <- merge(bms.predS, lookup.germplasm, by.x = "gen", by.y = "germplasmName", all.x = TRUE)
bms.predS <- bms.predS[with(bms.predS, order(Environment, gen)),]
BMSOutput$TRIAL_INSTANCE <- bms.predS$TRIAL_INSTANCE
BMSOutput$LOCATION_NAME <- bms.predS$Environment
BMSOutput$ENTRY_NO <- bms.predS$entryNumber
BMSOutput$GID <- bms.predS$germplasmDbId
BMSOutput[,c(col.mean)] <- bms.predS$predicted.value
BMSOutput[,c(col.se)] <- bms.predS$std.error
write.csv(BMSOutput, paste0("./outputs/", folder.name, "/BMSOutput.csv"), row.names = FALSE)
BMSSummary <- data.frame(matrix(ncol = 28, nrow = dim(summS)[1]))
colnames(BMSSummary) <- c("TRIAL_INSTANCE", "LOCATION_NAME", "Trait", "NumValues",
"NumMissing", "Mean", "Variance", "SD", "Min", "Max", "Range",
"Median", "LowerQuartile", "UpperQuartile", "MeanRep", "MinRep",
"MaxRep", "MeanSED", "MinSED", "MaxSED", "MeanLSD", "MinLSD",
"MaxLSD", "CV", "Heritability", "WaldStatistic", "WaldDF", "Pvalue")
bms.summS <- merge(summS, lookup.TRIAL_INSTANCE, by.x = "Environment", by.y = "trial", all.x = TRUE)
BMSSummary$TRIAL_INSTANCE <- bms.summS$TRIAL_INSTANCE
BMSSummary$LOCATION_NAME <- bms.summS$Environment
BMSSummary$Trait <- bms.summS$Trait
BMSSummary$NumValues <- bms.summS$nonNA_obs
BMSSummary$NumMissing <- bms.summS$NumMissing
BMSSummary$Mean <- bms.summS$grandMean
BMSSummary$Variance <- bms.summS$Variance
BMSSummary$SD <- bms.summS$SD
BMSSummary$Min <- bms.summS$Min
BMSSummary$Max <- bms.summS$Max
BMSSummary$Range <- bms.summS$Range
BMSSummary$Median <- bms.summS$Median
BMSSummary$LowerQuartile <- bms.summS$LowerQuartile
BMSSummary$UpperQuartile <- bms.summS$UpperQuartile
BMSSummary$MeanRep <- bms.summS$MeanRep
BMSSummary$MinRep <- bms.summS$MinRep
BMSSummary$MaxRep <- bms.summS$MaxRep
BMSSummary$MeanSED <- merge(bms.summS$Environment,