From c4405dc71fc893a9262a3d24a28aa300e6df8684 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Tue, 10 Sep 2024 20:27:11 -0600 Subject: [PATCH] updates to JOSS manuscript --- paper.bib | 127 ++++++++++++++++++++++++++++++++++++++---------------- paper.md | 90 +++++++++++++++++--------------------- 2 files changed, 130 insertions(+), 87 deletions(-) diff --git a/paper.bib b/paper.bib index 70702ec..29f6a38 100644 --- a/paper.bib +++ b/paper.bib @@ -44,53 +44,108 @@ @article{Federer2018 } @article{Tedersoo2021, - title={Data sharing practices and data availability upon request differ across scientific disciplines}, - author={Tedersoo, Leho and K{\"u}ngas, Rainer and Oras, Ester and K{\"o}ster, Kajar and Eenmaa, Helen and Leijen, {\"A}li and Pedaste, Margus and Raju, Marju and Astapova, Anastasiya and Lukner, Heli and others}, - journal={Scientific data}, - volume={8}, - number={1}, - pages={192}, - year={2021}, - publisher={Nature Publishing Group UK London} + title = {Data sharing practices and data availability upon request differ across scientific disciplines}, + author = {Tedersoo, Leho and K{\"u}ngas, Rainer and Oras, Ester and K{\"o}ster, Kajar and Eenmaa, Helen and Leijen, {\"A}li and Pedaste, Margus and Raju, Marju and Astapova, Anastasiya and Lukner, Heli and others}, + journal = {Scientific data}, + volume = {8}, + number = {1}, + pages = {192}, + year = {2021}, + publisher = {Nature Publishing Group UK London} } @article{Huston2019, - title={Open science/open data: Reaping the benefits of open data in public health}, - author={Huston, P and Edge, VL and Bernier, E}, - journal={Canada Communicable Disease Report}, - volume={45}, - number={11}, - pages={252}, - year={2019}, - publisher={Public Health Agency of Canada} + title = {Open science/open data: Reaping the benefits of open data in public health}, + author = {Huston, P and Edge, VL and Bernier, E}, + journal = {Canada Communicable Disease Report}, + volume = {45}, + number = {11}, + pages = {252}, + 
year = {2019}, + publisher = {Public Health Agency of Canada} } @article{Vanderbilt2022, - title={Publishing ecological data in a repository: An easy workflow for everyone}, - author={Vanderbilt, Kristin and Ide, Jon and Gries, Corinna and Grossman-Clarke, Susanne and Hanson, Paul and O'Brien, Margaret and Servilla, Mark and Smith, Colin and Waide, Robert and Zollo-Venecek, Kyle}, - journal={The Bulletin of the Ecological Society of America}, - volume={103}, - number={4}, - pages={e2018}, - year={2022}, - publisher={Wiley Online Library} + title = {Publishing ecological data in a repository: An easy workflow for everyone}, + author = {Vanderbilt, Kristin and Ide, Jon and Gries, Corinna and Grossman-Clarke, Susanne and Hanson, Paul and O'Brien, Margaret and Servilla, Mark and Smith, Colin and Waide, Robert and Zollo-Venecek, Kyle}, + journal = {The Bulletin of the Ecological Society of America}, + volume = {103}, + number = {4}, + pages = {e2018}, + year = {2022}, + publisher = {Wiley Online Library} } -@article{EML2019, title={Ecological Metadata Language version 2.2.0}, url={https://eml.ecoinformatics.org}, DOI={10.5063/f11834t2}, publisher={KNB Data Repository}, author={Jones, Matthew and O’Brien, Margaret and Mecum, Bryce and Boettiger, Carl and Schildhauer, Mark and Maier, Mitchell and Whiteaker, Timothy and Earl, Stevan and Chong, Steven}, year={2019} } +@article{EML2019, + title = {Ecological Metadata Language version 2.2.0}, + url = {https://eml.ecoinformatics.org}, DOI={10.5063/f11834t2}, + publisher = {KNB Data Repository}, + author = {Jones, Matthew and O’Brien, Margaret and Mecum, Bryce and Boettiger, Carl and Schildhauer, Mark and Maier, Mitchell and Whiteaker, Timothy and Earl, Stevan and Chong, Steven}, + year = {2019} } @article{Nelson2022, - title={Memorandum for the heads of executive departments and agencies: Ensuring free, immediate, and equitable access to federally funded research}, - author={Nelson, Alondra and others}, - year={2022}, - 
publisher={United States. Office of Science and Technology Policy} + title = {Memorandum for the heads of executive departments and agencies: Ensuring free, immediate, and equitable access to federally funded research}, + author = {Nelson, Alondra and others}, + year = {2022}, + publisher = {United States. Office of Science and Technology Policy} } @article{Boettiger2019, - title={Ecological metadata as linked data}, - author={Boettiger, Carl}, - journal={Journal of Open Source Software}, - volume={4}, - number={34}, - pages={1276}, - year={2019} + title = {Ecological metadata as linked data}, + author = {Boettiger, Carl}, + journal = {Journal of Open Source Software}, + volume = {4}, + number = {34}, + pages = {1276}, + year = {2019} +} + +@article{Jones2006, + author = "Jones, Matthew B. and Schildhauer, Mark P. and Reichman, O. J. and Bowers, Shawn", + title = "The New Bioinformatics: Integrating Ecological Data from the Gene to the Biosphere", + journal = "Annual Review of Ecology, Evolution, and Systematics", + year = "2006", + volume = "37", + number = "1", + pages = "519--544", + doi = "10.1146/annurev.ecolsys.37.091305.110031", + url = "https://www.annualreviews.org/content/journals/10.1146/annurev.ecolsys.37.091305.110031", + publisher = "Annual Reviews", + issn = "1545-2069", + type = "Journal Article", + keywords = "semantics", + keywords = "metadata", + keywords = "ecoinformatics", + keywords = "data sharing", + keywords = "ontology", + keywords = "scientific workflows", + keywords = "data integration", + abstract = "Abstract Bioinformatics, the application of computational tools to the management and analysis of biological data, has stimulated rapid research advances in genomics through the development of data archives such as GenBank, and similar progress is just beginning within ecology.
One reason for the belated adoption of informatics approaches in ecology is the breadth of ecologically pertinent data (from genes to the biosphere) and its highly heterogeneous nature. The variety of formats, logical structures, and sampling methods in ecology create significant challenges. Cultural barriers further impede progress, especially for the creation and adoption of data standards. Here we describe informatics frameworks for ecology, from subject-specific data warehouses, to generic data collections that use detailed metadata descriptions and formal ontologies to catalog and cross-reference information. Combining these approaches with automated data integration techniques and scientific workflow systems will maximize the value of data and open new frontiers for research in ecology.", + } + +@article{Boettiger2019_emld, + title = {Ecological Metadata as Linked Data}, + author = {Boettiger, Carl}, + month = feb, + year = {2019}, + publisher = {The Open Journals}, + journal = {Journal of Open Source Software}, + number = {34}, + volume = {4}, + pages = {1276}, + doi = {10.21105/joss.01276}, + url = {https://doi.org/10.21105/joss.01276} +} + +@article{Wilkinson2016, + title = {The {FAIR} Guiding Principles for scientific data management and stewardship}, + author = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and Bouwman, Jildau and Brookes, Anthony J. and Clark, Tim and Crosas, Mercè and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T. and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J.G. and Groth, Paul and Goble, Carole and Grethe, Jeffrey S. and Heringa, Jaap and ’t Hoen, Peter A.C and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J. and Martone, Maryann E.
and Mons, Albert and Packer, Abel L. and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A. and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend}, + month = mar, + year = {2016}, + journal = {Scientific Data}, + number = {1}, + volume = {3}, + pages = {160018}, + doi = {10.1038/sdata.2016.18}, + url = {https://doi.org/10.1038/sdata.2016.18} } diff --git a/paper.md b/paper.md index 100927d..0869b70 100644 --- a/paper.md +++ b/paper.md @@ -68,85 +68,73 @@ affiliations: # Summary -NPSdataverse is a suite of R packages modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The overarching theme of the NPSdataverse packages is creating, publishing, and accessing open, machine-readable data and metadata. NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. The NPSdataverse contains some of the constituent packages (R/EML and R/EMLassemblyline) are general-use and aimed at authoring EML documents. Other packages (R/QCkit, R/EMLeditor, R/DPchecker and R/NPSutils) are designed and maintained by the National Park Service. Although many functions within the NPSdataverse packages are NPS-specific (particularly API calls), all of the functions are written so that they can also be used by the general public. Anyone interested applying for research permits or conducting research on National Park Units can reference and utilize the NPSdataverse packages. Additionally, the packages will be useful for data management plans in wide variety of grant proposals and for anyone that needs to create open data and machine readable metadata.
Finally, the ability to author, edit, and check EML metadata will be useful for data publication at any number of repositories or data journals. +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) is a suite of R packages modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The overarching theme of the NPSdataverse packages is creating, publishing, and accessing open, machine-readable data and metadata. NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. Some of the constituent packages ([R/EML](https://docs.ropensci.org/EML/) and [R/EMLassemblyline](https://ediorg.github.io/EMLassemblyline/)) are general-use and aimed at authoring EML documents. Other packages ([R/QCkit](https://nationalparkservice.github.io/QCkit/), [R/EMLeditor](https://nationalparkservice.github.io/EMLeditor/), [R/DPchecker](https://nationalparkservice.github.io/DPchecker/) and [R/NPSutils](https://nationalparkservice.github.io/NPSutils/)) are designed and maintained by the National Park Service (NPS). Although many functions within the NPSdataverse packages are NPS-specific (particularly some API calls), all of the functions are written so that they can also be used by the general public. Anyone interested in applying for research permits or conducting research on NPS Units can reference and utilize the NPSdataverse packages. Additionally, the packages will be useful for data management plans in a wide variety of grant proposals and for anyone that needs to create open data and machine-readable metadata. Finally, the ability to swiftly and easily author, edit, and check Ecological Metadata Language (EML) metadata in a reproducible fashion will be useful for data publication at any number of repositories or data journals. -# Statement of need +# Statement of Need Following a long-term movement for transparency and data accessibility, the U.S.
implemented an Open Data Memorandum in 2013 (OMB M-13-13) and the federal OPEN Government Data Act of 2019 [@OpenData2019]. The Open Data Act mandated that federal agencies provide data in open formats with metadata. Subsequently, many funding agencies such as the National Science Foundation have required grant awardees to make data public, often including metadata ([@nsf2015]). Multiple publishers have followed suit ([@Wiley2022], [@Springer2023])) and require data availability statements upon publication. -One goal of open science, and requirement of the recent "Nelson Memo" is to make data findable, interoperable, accessible, and reuseable ([@Nelson2022]). These goals are often achieved by including structured, machine-readable metadata that conforms to a defined schema along with the data. Ecological Metadata Language Metadata (EML) is one metadata standard that is particularly amenable to studies with rich taxonomy ([@EML2019]). It has been adopted by multiple research organizations including the Ecological Data Initiative (EDI), the National Ecological Observatory Network (NEON), the Global Biodiversity Information Facility (GBIF), Swedish Biodiversity Data Infrastructure (SBDI), the French Biodiversity Hub ("Pole National de Donnees de Biodiversite"), the U.S. National Park Service, and others. +One goal of open science, and a requirement of the recent "Nelson Memo", is to make data FAIR: findable, accessible, interoperable, and reusable [@Nelson2022; @Wilkinson2016]. These goals are often achieved by including structured, machine-readable metadata that conforms to a defined schema along with the data. Ecological Metadata Language (EML) is one metadata standard that is particularly amenable to studies with rich taxonomy [@Jones2006; @EML2019].
It has been adopted by multiple research organizations including the Environmental Data Initiative (EDI), the National Ecological Observatory Network (NEON), the Global Biodiversity Information Facility (GBIF), the Swedish Biodiversity Data Infrastructure (SBDI), the French Biodiversity Hub ("Pôle National de Données de Biodiversité"), the U.S. National Park Service, and others. -Nevertheless, actual availability of data varies ([@Federer2018, @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science ([@Huston2019]). Multiple solutions have been presented, including ezEML, a workflow for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository ([@Vanderbilt2022]). ezEML is has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging. And, ezEML requires the user upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R-based packages for authoring, editing, and checking EML metadata locally in a scriptable fashion. Packages within the NPSdataverse leverage early work using R to create and manipulate XML based EML files ([@Boettiger2019]). Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as DataStore. Packages within the NPSdataverse also include data munging and data access/download functions that leverage the rich EML associated with the data.
+Nevertheless, actual availability of data varies [@Federer2018; @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science [@Huston2019]. Multiple solutions have been presented, including ezEML, a workflow for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository [@Vanderbilt2022]. ezEML has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging. Additionally, ezEML requires the user to upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R-based packages for authoring, editing, and checking EML metadata locally in a scriptable fashion. Packages within the NPSdataverse leverage earlier work using R to create and manipulate XML-based EML files [@Boettiger2019]. Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as the National Park Service's [DataStore](https://irma.nps.gov/DataStore/). Packages within the NPSdataverse also include data munging and data access/download functions that leverage the rich EML associated with the data. # NPSdataverse R package -The NPSdataverse package is a meta-package that loads packages within the NPSdataverse into R. NPSdataverse will automatically check that the latest version of the main development branch on GitHub is being loaded. If updates are indicated, the user will be alerted and given instructions on how to update the relevant packages.
To prevent API limits on GitHub.com, the package only checks for updates from an interactive R session and will skip checks when the system is not on-line or GitHub.com is not responding. +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) package is a meta-package that loads packages within the NPSdataverse into R. It provides a convenient way to download many of the packages needed to create and access data packages consisting of rich Ecological Metadata Language metadata and .csv data files: + +```{r install_NPSdataverse, eval = FALSE} +pak::pkg_install("nationalparkservice/NPSdataverse") +``` +NPSdataverse will automatically check that the latest version of the main development branch on GitHub is being loaded. If updates are indicated, the user will be alerted and given instructions on how to update the relevant packages. To prevent API limits on GitHub.com, the package only checks for updates from an interactive R session and will skip checks when the system is not on-line or GitHub.com is not responding. # QCkit R package -QCkit is primarily a data munging package designed to prepare data for metadata creation and publication. QCkit includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase interoperability by suggesting appropriate darwinCore column names. Additional functions allow users to convert between decimal latitude and longitude and UTMs, check whether GPS coordinates fall within specific National Park Service unit boundaries, add elevation based on GPS locations via a USGS API, and help deal with "missing values". QCkit also facilitates documenting data munging by generating DataStore references, which can have DOIs attached to them, based on GitHub.com releases. +[QCkit](https://nationalparkservice.github.io/QCkit/) is primarily a data munging package designed to prepare data for metadata creation and publication. 
QCkit includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase inter-operability by suggesting appropriate [Darwin Core](https://dwc.tdwg.org/) standards for naming data. Additional functions allow users to convert between decimal latitude and longitude and UTMs, check whether GPS coordinates fall within specific National Park Service unit boundaries, add elevation based on GPS locations via a USGS API, and help deal with "missing values". QCkit also facilitates documenting data munging by generating DataStore references based on GitHub.com releases. The DataStore references can hold processing scripts or code packages and have DOIs attached to them. # EML R package -The EML package is a fundamental package that allows for importing .xml files, creating and validating validating EML within R, and writing R objects back out to .xml files. R/EML allows for creating fully fledged Ecological Metadata Language Metadata files using nested S3 lists within R while relying on the R/emld package. +The R/[EML](https://docs.ropensci.org/EML/) package is a fundamental package that allows for importing .xml files, creating and validating EML within R, and writing R objects back out to .xml files. R/EML allows for creating fully fledged Ecological Metadata Language metadata files using nested S3 lists within R while relying on the R/[emld](https://docs.ropensci.org/emld/) package [@Boettiger2019_emld]. # EMLassembyline R package -The EMLassemblyline package builds upon R/EML and adds substantial functionality. For instance, EMLassemblyline allows the user to supply .csv files, which are then used to generate template .txt files. Users can adjust the template files as needed and use the `make_eml()` function to generate an R-object that can be exported via R/EML as an EML-fomatted .xml file.
EMLassemblyline includes the ability generate entire taxonomic backbones from lists of scientific names via API calls to ITIS, GBIF, or Worms. EMLassemblyline will validate the R object against the EML schema and provide helpful hints on what might have gone wrong during the `make_eml()` process. EMLassemblyline provides an efficient bridge between data and EML metadata for users who are familiar with R but may not be on the EML schema or the detailed nested lists needed to create EML. Products from the EMLassemblyline pipeline are suitable for publication at multiple repositories including the Environmental Data Initiative. +The [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/) (EAL) package builds upon R/EML and adds substantial functionality. For instance, EAL allows the user to supply .csv files, which are used to generate template .txt files. Users can adjust the template files as needed and use the `EMLassemblyline::make_eml()` function to generate an R object that can be exported via R/EML as an EML-formatted .xml file. EAL includes the ability to generate entire taxonomic backbones from lists of scientific names via API calls to ITIS, GBIF, or WoRMS. EAL will validate the R object against the EML schema and provide helpful hints on what might have gone wrong during the `EMLassemblyline::make_eml()` process. EAL provides an efficient bridge between data and EML metadata for users who are familiar with R but may not be experts on the EML schema or the detailed nested lists needed to create EML within R via R/EML. Products from the EAL pipeline are suitable for publication at multiple repositories including the Environmental Data Initiative. # EMLeditor R package -The EMLeditor package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata without having to re-run the EMLassemblyline steps (these can be time consuming, especially if there are many taxa that need to be resolved).
EMLeditor includes the ability to pick specific licenses (CC0, CC-BY, etc), add ORCIDs, include organizations as authors, and much more. EMLeditor also adds specific content necessary to be compliant with the National Park Service' DataStore. With the proper permissions, EMLeditor can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. Finally, EMLeditor contains a .rmd template file that is accessible in Rstudio under Files > New File > R markdown. The template provides an editable script that walks the user through using EMLassemblyline, EMLeditor, and DPchecker to create and validate EML metadata. +The [EMLeditor](https://nationalparkservice.github.io/EMLeditor/) package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata without having to re-run the EAL steps (EAL can be time-consuming, especially if there are many taxa that need to be resolved). EMLeditor includes the ability to pick specific licenses (CC0, CC-BY, etc.), add [ORCIDs](https://orcid.org/), include organizations as authors, and much more. EMLeditor also adds specific content necessary to be compliant with NPS's DataStore. With the proper permissions, EMLeditor can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. Finally, EMLeditor contains a .Rmd template file that is accessible in RStudio under File > New File > R Markdown. The template provides an editable script that walks the user through using EAL, EMLeditor, and DPchecker to create and validate EML metadata in R. + +EMLeditor "set" class functions (which all begin with "set_" such as "`EMLeditor::set_abstract()`") will add several NPS-specific items to metadata using their default settings. For instance, these functions will set NPS as the publisher, Fort Collins as the location, and will add a "for or by NPS = TRUE" statement to the metadata.
To invoke these functions without adding the NPS-specific metadata elements, set the parameter `NPS = FALSE`. Non-NPS publisher information can be added using the `EMLeditor::set_publisher()` function with the parameters `for_or_by_NPS` and `NPS` set to `FALSE`: + +```{r non-NPS-example, eval=FALSE} +new_metadata1 <- set_abstract(eml_object = old_metadata, + abstract = "This is example/test abstract text", + NPS = FALSE) +new_metadata2 <- set_publisher(eml_object = new_metadata1, + org_name = "My Institution", + street_address = "1234 Sesame St.", + city = "Anytown", + State = "Delaware", + zip_code = "12345", + country = "USA", + URL = "https://www.MyInstitution.us", + email = "publisher@myinstitution.us", + ror_id = "", + for_or_by_NPS = FALSE, + NPS = FALSE) +``` +) # DPchecker R Package -The DPchecker package provides detailed feedback on data-metadata congruence. DPchecker goes beyond validating EML objects in R against the EML schema. Using the `run_congruence_checks` function, DPchecker will conduct a series of 46 checks. These are divided into several categories: Metadd to ensure that metadata are well formatted (file names are not duplicated, all data files are in the metadata file, no data files are not in the metadata file) - - -brief description of the various component packages - -# In-text Citations - -Citations to entries in paper.bib should be in -[rMarkdown](http://rmarkdown.rstudio.com/authoring_bibliographies_and_citations.html) -format. - -If you want to cite a software repository URL (e.g. something on GitHub -without a preferred citation) then you can do it with the example BibTeX -entry below for \@fidgit. - -For a quick reference, the following citation commands can be used: - -`@author:2001` -\> "Author et al. 
(2001)" - `[@author:2001]` -\> -"(Author et al., 2001)" - `[@author1:2001; @author2:2001]` -\> "(Author1 -et al., 2001; Author2 et al., 2002)" - -# Figures +The [DPchecker](https://nationalparkservice.github.io/DPchecker/) package provides detailed feedback on data-metadata congruence for use by both data package authors and reviewers. DPchecker goes beyond validating EML objects in R against the EML schema. Using the `DPchecker::run_congruence_checks()` function, DPchecker will conduct a series of 46 checks. These are divided into several categories: 1) metadata checks to ensure that metadata are well formatted (file names are not duplicated, files specify the field delimiter, data files have URLs, the proper delimiter and header row numbers are present, etc.); 2) checks that metadata elements necessary for DataStore automated extraction are present: creators have valid surnames, publication date is present and in the correct format, keywords are present, abstract and methods are present and well formatted, license is present, attributes have definitions, etc.; 3) checks that recommended EML elements are present, including ORCIDs and a notes section; 4) checks that metadata and data are in congruence, including that all data files are listed in the metadata and all file names in the metadata refer to data files, the columns in the metadata match the columns in the data files, missing fields in data files are properly documented in metadata, columns indicated as numeric in metadata are numeric in the data files, the date format in the metadata matches the date format in the data files, and dates in data files fall within the date ranges given in the metadata; and 5) data and metadata compliance checks, including tests for information that should not be released to the public such as non-.gov emails and GPS coordinates if the data package is not set to public. For each test, the data package may fail with an error, fail with a warning, or pass.
When warnings and errors are generated, the user is pointed towards the appropriate EMLeditor function to address the problem. DPchecker will often throw a warning even if an item exists and is properly formatted but could be improved to increase the FAIR characteristics of the metadata. For instance, DPchecker will throw a warning if an abstract is fewer than 20 words long as it is unlikely the creator is able to meaningfully describe the data collection and processing in fewer than 20 words. -Figures can be included like this: ![Caption for example -figure.](figure.png) and referenced from text using -\autoref{fig:example}. +# NPSutils R Package -Figure sizes can be customized by adding an optional second parameter: -![Caption for example figure.](figure.png){width="20%"} +The [NPSutils](https://nationalparkservice.github.io/NPSutils/) package serves primarily as a way to access data. NPSutils provides avenues for directly downloading data from DataStore using R. NPSutils can also import data downloaded from any repository into R and take advantage of rich EML metadata to call column types. NPSutils provides some basic meta-analysis capability, assuming certain inter-operability standards are met (such as consistently naming columns with species or GPS coordinates). NPSutils can also be used to import data and metadata into common data visualization tools such as Power BI. # Acknowledgements -We acknowledge contributions from across the National Park Service, but -in particular from the Inventory and Monitoring Division. Members of the -NPS Long Term Data Management Governing Board provided critical guidance -and insight (in addition to several of the authors, these include -Kristen Bonebrake, Adam Kozlowski, Ryan Monello, Mark Isley, and Megan -Swan). Justin Mills (currently at U.S. Fish and Wildlife Service) and -Derrick Dardano helped with navigating API and Active Directory -interfaces, Marsha Leavitt made and explained numerous updates to -DataStore.
Dan Gussett, Kate Miller, and Pete Budde facilitated software -availability, and Meg White supported and endorsed the project. We are -particularly indebted to our strong user base and their very helpful -feedback including Alison Loar, Christina Appleby, Kirk Sherrill, Lisa -Nelson and Tom Phillipi. Numerous Student Conservation Association -interns made contributions to the code base including Sarah Kelso, James -Brown, and Amy Sherman. Alissa Graff (currently at the Internal Revenue -Service) provided important input on early versions of NPSutils. +We acknowledge contributions from across the National Park Service, but in particular from the Inventory and Monitoring Division. Members of the NPS Long Term Data Management Governing Board provided critical guidance and insight (in addition to several of the authors, these include Kristen Bonebrake, Adam Kozlowski, Ryan Monello, Mark Isley, and Megan Swan). Justin Mills (currently at U.S. Fish and Wildlife Service) and Derrick Dardano helped with navigating API and Active Directory interfaces, Marsha Leavitt made and explained numerous updates to DataStore. Dan Gussett, Kate Miller, and Pete Budde facilitated software availability, and Meg White supported and endorsed the project. We are particularly indebted to our strong user base and their very helpful feedback including Alison Loar, Christina Appleby, Kirk Sherrill, Lisa Nelson and Tom Phillipi. Numerous Student Conservation Association interns made contributions to the code base including Sarah Kelso, James Brown, and Amy Sherman. Alissa Graff (currently at the Internal Revenue Service) provided important input on early versions of NPSutils. # References