-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Combined table output #144
Changes from 27 commits
7199d18
bb95cbe
bf2d654
0747302
487ede8
3eddab2
a67a1b0
45c03a7
b2e31f0
5a15b22
10380f1
c227716
91a56a0
fce66ad
26465d9
c101fb4
4ad7f54
667fe32
95d016c
6d0b24a
713a60f
f0767b8
8b6f7b7
8d6474e
dc622c9
f495c7b
68fa9d4
12cbe01
b53d933
e78833b
9717d45
63b812a
e9f7145
6ad632f
97d9a07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#' Create combined traits.build table | ||
#' | ||
#' Create a single database output that merges together the information | ||
#' in all relational tables within a traits.build database. | ||
#' Trait measurements are still output in long format (1 row per trait value), | ||
#' but all measurement-related metadata (methods, location properties, context properties, contributors) | ||
#' are now included as additional columns in a single table. | ||
#' | ||
#' @param database A traits.build database | ||
#' | ||
#' @return | ||
#' @export | ||
#' | ||
#' @examples | ||
database_create_combined_table <- function(database) { | ||
|
||
location_latlon <- | ||
database$locations %>% | ||
dplyr::filter(location_property %in% c("latitude (deg)", "longitude (deg)")) %>% | ||
tidyr::pivot_wider(names_from = location_property, values_from = value) | ||
|
||
location_properties <- | ||
database$locations %>% | ||
dplyr::filter(!location_property %in% c("latitude (deg)", "longitude (deg)")) %>% | ||
dplyr::mutate( | ||
location_property = stringr::str_replace_all(location_property, "=", "-"), | ||
value = stringr::str_replace_all(value, "=", "-"), | ||
location_property = stringr::str_replace_all(location_property, ";", ","), | ||
value = stringr::str_replace_all(value, ";", ",") | ||
) %>% | ||
dplyr::mutate(location_properties = paste0(location_property, "=", value)) %>% | ||
dplyr::select(dplyr::all_of(c("dataset_id", "location_id", "location_name", "location_properties"))) %>% | ||
dplyr::group_by(dataset_id, location_id, location_name) %>% | ||
dplyr::mutate(location_properties = paste0(location_properties, collapse = "; ")) %>% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. possibly but, I think |
||
dplyr::ungroup() %>% | ||
dplyr::distinct() | ||
|
||
contributors <- | ||
database$contributors %>% | ||
dplyr::mutate( | ||
affiliation = stringr::str_replace_all(affiliation, ":", "-"), | ||
affiliation = stringr::str_replace_all(affiliation, ";", ","), | ||
affiliation = stringr::str_replace_all(affiliation, "<", "("), | ||
affiliation = stringr::str_replace_all(affiliation, ">", ")"), | ||
additional_role = stringr::str_replace_all(additional_role, "<", "("), | ||
additional_role = stringr::str_replace_all(additional_role, ">", ")"), | ||
data_collectors = paste0(given_name, " ", last_name), | ||
data_collectors = ifelse( | ||
!is.na(ORCID), | ||
paste0(data_collectors, " <ORCID:", ORCID), | ||
data_collectors), | ||
data_collectors = ifelse( | ||
is.na(ORCID), | ||
paste0(data_collectors, " <affiliation:", affiliation), | ||
paste0(data_collectors, ";affiliation:", affiliation)), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add spaces after the semicolons and use equal (=) signs instead of colons, consistent with |
||
data_collectors = ifelse( | ||
!is.na(additional_role), | ||
paste0(data_collectors, ";additional_role:", additional_role, ">"), | ||
paste0(data_collectors, ">")) | ||
) %>% | ||
dplyr::select(-dplyr::all_of(c("last_name", "given_name", "ORCID", "affiliation", "additional_role"))) %>% | ||
dplyr::group_by(dataset_id) %>% | ||
dplyr::mutate(data_collectors = paste0(data_collectors, collapse = "; ")) %>% | ||
dplyr::ungroup() %>% | ||
dplyr::distinct() | ||
|
||
contexts_tmp <- | ||
database$contexts %>% | ||
dplyr::mutate( | ||
context_property = stringr::str_replace_all(context_property, "=", "-"), | ||
value = stringr::str_replace_all(value, "=", "-"), | ||
description = stringr::str_replace_all(description, "=", "-"), | ||
context_property = stringr::str_replace_all(context_property, ";", ","), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do the characters "<" and ">" also need to be replaced in case they ever make it into the |
||
value = stringr::str_replace_all(value, ";", ","), | ||
description = stringr::str_replace_all(description, "=", "-"), | ||
yangsophieee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
value = ifelse( | ||
is.na(description), | ||
paste0(context_property, ":", value), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly here I wonder if "=" would be more readable than ":" |
||
paste0(context_property, ":", value, " <", description, ">")) | ||
) %>% | ||
dplyr::select(-dplyr::all_of(c("description", "context_property", "category"))) %>% | ||
tidyr::separate_longer_delim(link_vals, ", ") %>% | ||
distinct() | ||
|
||
reformat_contexts <- function(contexts_table, context_id) { | ||
context_category <- gsub("_id", "_properties", context_id, fixed = TRUE) | ||
out <- contexts_table %>% | ||
dplyr::filter(link_id == context_id) %>% | ||
dplyr::select(-link_id) %>% | ||
dplyr::distinct(dataset_id, link_vals, .keep_all = TRUE) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this line necessary? Shouldn't it be distinct anyway, otherwise there's something wrong with the contexts table that we want to be picked up by |
||
|
||
names(out)[which(names(out) == "value")] <- context_category | ||
names(out)[which(names(out) == "link_vals")] <- context_id | ||
out | ||
} | ||
|
||
join_contexts <- function(data, contexts_tmp) { | ||
data %>% | ||
dplyr::left_join( | ||
by = c("dataset_id", "treatment_context_id"), | ||
reformat_contexts(contexts_tmp, "treatment_context_id") | ||
) %>% | ||
dplyr::left_join( | ||
by = c("dataset_id", "plot_context_id"), | ||
reformat_contexts(contexts_tmp, "plot_context_id") | ||
) %>% | ||
dplyr::left_join( | ||
by = c("dataset_id", "entity_context_id"), | ||
reformat_contexts(contexts_tmp, "entity_context_id") | ||
) %>% | ||
dplyr::left_join( | ||
by = c("dataset_id", "temporal_context_id"), | ||
reformat_contexts(contexts_tmp, "temporal_context_id") | ||
) %>% | ||
dplyr::left_join( | ||
by = c("dataset_id", "method_context_id"), | ||
reformat_contexts(contexts_tmp, "method_context_id") | ||
) | ||
} | ||
|
||
combined_table <- | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering if the ID columns that help join to other relational tables could be removed from the combined table? |
||
database$traits %>% | ||
dplyr::left_join(location_latlon, by = c("dataset_id", "location_id")) %>% | ||
dplyr::left_join(location_properties, by = c("dataset_id", "location_id", "location_name")) %>% | ||
austraits::join_contexts(contexts_tmp) %>% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't you define a function called |
||
dplyr::left_join( | ||
database$methods %>% dplyr::select(-dplyr::all_of(c("data_collectors"))), | ||
by = c("dataset_id", "trait_name", "method_id") | ||
) %>% | ||
dplyr::left_join(contributors, by = c("dataset_id")) %>% | ||
dplyr::left_join(database$taxa, by = c("taxon_name")) %>% | ||
dplyr::left_join(database$taxonomic_updates, by = c("taxon_name", "dataset_id", "original_name")) | ||
|
||
combined_table | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1176,6 +1176,20 @@ dataset_test_worker <- | |
info = paste0(red(dataset_id), "\t`db_traits_pivot_longer` threw a warning") | ||
) | ||
} | ||
|
||
expect_no_error( | ||
combined_table <- database_create_combined_table(dataset), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This tests creating a combined table for a single dataset, which may very well be intended. Will we also add |
||
info = paste0(red(dataset_id), "\t`database_create_combined_table`") | ||
) | ||
|
||
expect_equal( | ||
nrow(combined_table), nrow(dataset$traits), | ||
info = sprintf( | ||
"%s\tnumber of rows of combined table not equal to rows of original traits table", | ||
red(dataset_id) | ||
) | ||
) | ||
|
||
} | ||
}) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs documentation