This repository has been archived by the owner on Apr 15, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2021-12-13_xindi_combine_variable_synopsis.R
124 lines (98 loc) · 4.3 KB
/
2021-12-13_xindi_combine_variable_synopsis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Description: combine BPC variable synopsis from multiple cohorts.
# Author: Haley Hunter-Zinck
# Date: 2021-12-13
# setup ----------------------------
# wall-clock start time in seconds, used for the runtime report at the end
tic <- as.double(Sys.time())
library(glue)
library(dplyr)
library(synapser)
library(openxlsx)
# log in to Synapse (no credentials are passed explicitly)
synLogin()

# Synapse IDs of the per-cohort variable synopsis spreadsheets (inputs)
synid_file_brca <- "syn26077309"
synid_file_crc <- "syn26077307"
synid_file_nsclc <- "syn26028547"

# Synapse ID of the parent container that receives the merged synopsis (output)
synid_folder_output <- "syn26529348"
# functions ----------------------------
#' Read one sheet of an Excel spreadsheet stored on Synapse.
#'
#' Fetches the file entity with `synGet()` and parses the requested sheet
#' with `openxlsx::read.xlsx()`. The function is called through its namespace
#' so that reading a spreadsheet does not attach a package as a side effect.
#'
#' @param synapse_id Synapse ID of the spreadsheet.
#' @param version Version of the file. When `NA` (the default) or `NULL`,
#'   `synGet()` is called without a version argument.
#' @param sheet Sheet of the spreadsheet to read, passed to `read.xlsx()`.
#' @param check.names Whether R should modify names if non-conforming.
#' @return Data frame with the contents of the sheet.
#' @examples
#' get_synapse_entity_data_in_xlsx(synapse_id = "syn123345", sheet = 2)
get_synapse_entity_data_in_xlsx <- function(synapse_id,
                                            version = NA,
                                            sheet = 1,
                                            check.names = FALSE) {
  # is.na(NULL) is zero-length and would make `if` fail, so test NULL first
  if (is.null(version) || is.na(version)) {
    entity <- synGet(synapse_id)
  } else {
    entity <- synGet(synapse_id, version = version)
  }

  openxlsx::read.xlsx(entity$path, check.names = check.names, sheet = sheet)
}
#' Store a file on Synapse with options to define provenance.
#'
#' An `Activity` is attached to the stored file when at least one provenance
#' argument holds a non-missing value; otherwise the file is stored without
#' provenance.
#'
#' @param path Path to the file on the local machine.
#' @param parent_id Synapse ID of the folder or project to which to load the
#'   file.
#' @param file_name Name of the Synapse entity once loaded. When `NA` (the
#'   default), `path` is used as the name.
#' @param prov_name Provenance short description title.
#' @param prov_desc Provenance long description.
#' @param prov_used Vector of Synapse IDs of data used to create the current
#'   file to be loaded.
#' @param prov_exec String representing URL to script used to create the file.
#' @return Synapse ID of entity representing file.
save_to_synapse <- function(path,
                            parent_id,
                            file_name = NA,
                            prov_name = NA,
                            prov_desc = NA,
                            prov_used = NA,
                            prov_exec = NA) {
  if (is.na(file_name)) {
    file_name <- path
  }
  file <- File(path = path, parentId = parent_id, name = file_name)

  # `||` needs length-one operands, but prov_used is documented as a vector;
  # collapse every argument with any() so a multi-ID prov_used cannot break
  # the condition
  has_provenance <- any(!is.na(prov_name)) ||
    any(!is.na(prov_desc)) ||
    any(!is.na(prov_used)) ||
    any(!is.na(prov_exec))

  if (has_provenance) {
    act <- Activity(name = prov_name,
                    description = prov_desc,
                    used = prov_used,
                    executed = prov_exec)
    file <- synStore(file, activity = act)
  } else {
    file <- synStore(file)
  }

  file$properties$id
}
# read ----------------------------
# load sheet 2 of each cohort's spreadsheet, in BrCa, CRC, NSCLC order
synop_brca <- get_synapse_entity_data_in_xlsx(synid_file_brca, sheet = 2)
synop_crc <- get_synapse_entity_data_in_xlsx(synid_file_crc, sheet = 2)
synop_nsclc <- get_synapse_entity_data_in_xlsx(synid_file_nsclc, sheet = 2)
# main ----------------------------
# stack the cohort synopses in BrCa, CRC, NSCLC order; for a variable present
# in several cohorts, the first occurrence in that order is the row kept below
synop_all <- bind_rows(synop_brca, synop_crc, synop_nsclc)

# keep the first row per variable name using a logical mask; the negative-index
# form `x[-which(dup), ]` returns zero rows when there are no duplicates
is_dup <- duplicated(synop_all$Variable.Name)

# flag, per variable, which cohort synopses list it, then keep the output columns
synop_all <- synop_all[!is_dup, ] %>%
  mutate(
    BrCa = Variable.Name %in% synop_brca$Variable.Name,
    CRC = Variable.Name %in% synop_crc$Variable.Name,
    NSCLC = Variable.Name %in% synop_nsclc$Variable.Name
  ) %>%
  select(Dataset, Variable.Name, Field.Label, Data.Type, Values, BrCa, CRC, NSCLC)
# write ----------------------------
# local file name; it also becomes the Synapse entity name because
# save_to_synapse() falls back to `path` when no file_name is given
file_output <- "BPC-consortium_variables_synopsis.xlsx"

# call write.xlsx through its namespace so this step does not rely on openxlsx
# having been attached elsewhere in the script
openxlsx::write.xlsx(synop_all, file = file_output, overwrite = TRUE)

# upload with provenance; `finally` removes the local copy even when the
# upload fails, so no stray file is left in the working directory
tryCatch(
  save_to_synapse(path = file_output,
                  parent_id = synid_folder_output,
                  prov_name = "merged variable synopsis",
                  prov_desc = "variable synopsis from merging variable synopsis from BrCa, CRC, and NSCLC cohorts",
                  prov_used = c(synid_file_brca, synid_file_crc, synid_file_nsclc),
                  prov_exec = "https://github.com/hhunterzinck/genie_requests/blob/main/2021-12-13_xindi_combine_variable_synopsis.R"),
  finally = file.remove(file_output)
)
# close out ----------------------------
# wall-clock end time; report the elapsed time since `tic` in whole seconds
toc <- as.double(Sys.time())
print(glue("Runtime: {round(toc - tic)} s"))