Create ecocomDP Data

Each ecocomDP dataset (Level-1; L1) is created from a source dataset (Level-0; L0) by a unique processing function. Inputs are typically from the APIs of data repositories and monitoring networks, and outputs can be a list of R objects or a set of archivable files. The output form you choose may depend on multiple factors including the source data license (it may prohibit archive) and the overall processing time (on-demand processing may be prohibitively long). In either case, the derived ecocomDP dataset is delivered to users in a consistent format by read_data() and the processing function provides a fully reproducible and automated routine for updating the derived dataset whenever a new version of the source data is released.

Below is an example function that reads a source dataset from the Environmental Data Initiative (EDI) repository and converts it into an ecocomDP dataset, which in turn is archived in EDI. For an example of creating an ecocomDP dataset on-demand (i.e. not archived), see the map_neon_data_to_ecocomDP.R source code.

Use view_diagram() and view_descriptions() for an overview of ecocomDP table relationships and requirements.

library(ecocomDP)

For archive in a data repository

Example function:

# -----------------------------------------------------------------------------
# This function converts source dataset "knb-lter-hfr.118" (archived in the EDI
# Data Repository) to ecocomDP dataset "edi.193" (also archived in EDI)
# 
# Arguments:
#
# path        Where the ecocomDP tables will be written
# source_id   Identifier of the source dataset
# derived_id  Identifier of the derived dataset
# url         The URL by which the derived tables and metadata can be accessed 
#             by a data repository. This argument is used when automating the 
#             repository publication step, but not used when manually 
#             publishing.
#
# Value:
#
# tables      (.csv) ecocomDP tables
# metadata    (.xml) EML metadata for tables
# 
# Details:
#             This function facilitates automated updates to the derived 
#             "edi.193" whenever new data are added to the source 
#             "knb-lter-hfr.118". The framework executing this maintenance 
#             routine is hosted on a remote server and jumps into action 
#             whenever an update notification is received for 
#             "knb-lter-hfr.118". The maintenance routine parses the 
#             notification to get the arguments to create_ecocomDP().
#
# Landing page to source dataset "knb-lter-hfr.118":
# https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-hfr&identifier=118
# Landing page to derived dataset "edi.193":
# https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=193
# -----------------------------------------------------------------------------

# Libraries used by this function

library(ecocomDP)
library(xml2)
library(magrittr)
library(data.table)
library(lubridate)
library(tidyr)
library(dplyr)
library(EDIutils)       # remotes::install_github("EDIorg/EDIutils")
library(taxonomyCleanr) # remotes::install_github("EDIorg/taxonomyCleanr")

# Convert a source (L0) dataset to an ecocomDP (L1) dataset: read the source 
# tables from EDI, flatten them, parse the flat table into ecocomDP tables, 
# write and validate the tables at "path", and create EML metadata for them.
# Invisibly returns the value of ecocomDP::create_eml().

create_ecocomDP <- function(path,
                            source_id, 
                            derived_id, 
                            url = NULL) {
  
  # Read source dataset -------------------------------------------------------
  
  # The source dataset is about ant communities and their functional traits 
  # changing in response to an invasive species. Observations are made across 
  # habitat types within the Harvard Experimental Forest. The dataset consists 
  # of a primary table listing abundances at sites through time, and an 
  # ancillary table listing physical and functional traits of observed species.
  
  # Read the source dataset from EDI
  
  eml <- EDIutils::api_read_metadata(source_id)
  data <- EDIutils::read_tables(
    eml = eml, 
    strip.white = TRUE,
    na.strings = "",
    convert.missing.value = TRUE, 
    add.units = TRUE)
  
  ants <- data$`hf118-01-ants.csv`
  traits <- data$`hf118-02-functional-traits.csv`
  ants$date <- ymd(ants$date)
  
  # Join and flatten the source dataset ---------------------------------------
  
  # Joining all source data and relevant metadata into one big flat table 
  # simplifies parsing into ecocomDP tables and facilitates referential 
  # integrity in the process.
  
  # Remove duplicate data from the ancillary table and join on species "code"
  
  traits <- traits %>% select(-genus, -species)
  traits <- traits %>% rename(code = species.code)
  wide <- left_join(ants, traits, by = "code")
  
  # Convert wide format to "flat" format. This is the wide form but gathered on 
  # core observation variables, which are often > 1 in source datasets. This 
  # "flat" table is the "widest" ecocomDP datasets can be consistently returned 
  # to by the ecocomDP::flatten_data() function, and is the input format 
  # required by the "create table" helpers we'll meet shortly. This dataset 
  # only has one core observation variable, "abundance", so gathering really 
  # only entails a change of column names.
  
  wide <- wide %>% rename(value_abundance = abundance)
  flat <- pivot_longer(
    wide,
    cols = matches("abundance"), 
    names_to = c(".value", "variable_name"), 
    names_sep = '\\_')
  
  # We're now in a good place to begin adding columns of the ecocomDP tables 
  # we can create from this source dataset. We'll begin with the observation 
  # table.
  
  # Add columns for the observation table -------------------------------------
  
  # The frequency and timing of surveys (events) varied throughout the history 
  # of this dataset and are uniquely identifiable by grouping sample dates by 
  # year and month.
  
  flat$event_id <- flat %>% group_by(month = floor_date(flat$date, "month"),
                                     year) %>% group_indices()
  flat <- flat %>% arrange(event_id)
  
  # Observations are made in plots, which are nested in blocks. Unique 
  # combinations of these form a location
  
  flat$location_id <- flat %>% group_by(plot, block) %>% group_indices()
  
  # Each row of the flattened source dataset represents an observation of taxa 
  # abundance and should have a unique ID for reference
  
  flat$observation_id <- seq(nrow(flat))
  
  # Add columns for the location table ----------------------------------------
  
  # Ideally, the source dataset would include latitude, longitude, and 
  # elevation for each location_id, but all we have are coordinates for the 
  # area encompassing all sampling locations. The best we can do here is use 
  # the middle of the bounding box and mean of the bounding elevations.
  
  geocov <- xml_find_all(eml, ".//geographicCoverage")
  north <- xml_double(xml_find_all(geocov, './/northBoundingCoordinate'))
  east <- xml_double(xml_find_all(geocov, './/eastBoundingCoordinate'))
  south <- xml_double(xml_find_all(geocov, './/southBoundingCoordinate'))
  west <- xml_double(xml_find_all(geocov, './/westBoundingCoordinate'))
  elev_max <- xml_double(xml_find_all(geocov, './/altitudeMaximum'))
  elev_min <- xml_double(xml_find_all(geocov, './/altitudeMinimum'))
  
  flat$latitude <- mean(c(north, south))
  flat$longitude <- mean(c(east, west))
  flat$elevation <- mean(c(elev_max, elev_min))
  
  # Add columns for the taxon table -------------------------------------------
  
  # Taxonomic entities of this dataset are comprised of unique genus and 
  # species pairs
  
  flat <- flat %>% 
    mutate(taxon_name = trimws(paste(genus, species))) %>% 
    select(-genus, -species)
  
  flat$taxon_id <- flat %>% group_by(taxon_name) %>% group_indices()
  
  # While not required, resolving taxonomic entities to an authority system 
  # improves the discoverability and interoperability of the ecocomDP dataset. 
  # We can resolve taxa by sending names through taxonomyCleanr for direct 
  # matches against the Integrated Taxonomic Information System 
  # (ITIS; https://www.itis.gov/).
  
  taxa_resolved <- taxonomyCleanr::resolve_sci_taxa(
    x = unique(flat$taxon_name),
    data.sources = 3)
  
  taxa_resolved <- taxa_resolved %>%
    select(taxa, rank, authority, authority_id) %>%
    rename(taxon_rank = rank,
           taxon_name = taxa,
           authority_system = authority,
           authority_taxon_id = authority_id)
  
  flat <- left_join(flat, taxa_resolved, by = "taxon_name")
  
  # Add columns for the dataset_summary table ---------------------------------
  
  dates <- flat$date %>% stats::na.omit() %>% sort()
  
  # Use the calc_*() helper functions for consistency
  
  flat$package_id <- derived_id
  flat$original_package_id <- source_id
  flat$length_of_survey_years <- ecocomDP::calc_length_of_survey_years(dates)
  flat$number_of_years_sampled <- ecocomDP::calc_number_of_years_sampled(dates)
  flat$std_dev_interval_betw_years <- 
    ecocomDP::calc_std_dev_interval_betw_years(dates)
  flat$max_num_taxa <- length(unique(flat$taxon_name))
  flat$geo_extent_bounding_box_m2 <- 
    ecocomDP::calc_geo_extent_bounding_box_m2(west, east, north, south)
  
  # Odds and ends -------------------------------------------------------------
  
  # Rename source columns with an ecocomDP equivalent (date to datetime)
  
  flat <- flat %>% rename(datetime = date)
  
  # The hard work is done! The flat table contains all the source data and 
  # more! We can now use the "create" functions to parse this table into the 
  # ecocomDP tables.
  
  # Parse flat into ecocomDP tables -------------------------------------------
  
  # Each ecocomDP table has an associated "create" function. Begin with the 
  # core required tables.
  
  observation <- ecocomDP::create_observation(
    L0_flat = flat, 
    observation_id = "observation_id", 
    event_id = "event_id", 
    package_id = "package_id",
    location_id = "location_id", 
    datetime = "datetime", 
    taxon_id = "taxon_id", 
    variable_name = "variable_name",
    value = "value",
    unit = "unit")
  
  location <- ecocomDP::create_location(
    L0_flat = flat, 
    location_id = "location_id", 
    location_name = c("block", "plot"), 
    latitude = "latitude", 
    longitude = "longitude", 
    elevation = "elevation")
  
  taxon <- ecocomDP::create_taxon(
    L0_flat = flat, 
    taxon_id = "taxon_id", 
    taxon_rank = "taxon_rank", 
    taxon_name = "taxon_name", 
    authority_system = "authority_system", 
    authority_taxon_id = "authority_taxon_id")
  
  dataset_summary <- ecocomDP::create_dataset_summary(
    L0_flat = flat, 
    package_id = "package_id", 
    original_package_id = "original_package_id", 
    length_of_survey_years = "length_of_survey_years",
    number_of_years_sampled = "number_of_years_sampled", 
    std_dev_interval_betw_years = "std_dev_interval_betw_years", 
    max_num_taxa = "max_num_taxa", 
    geo_extent_bounding_box_m2 = "geo_extent_bounding_box_m2")
  
  # Create the ancillary ecocomDP tables. These are optional, but should be 
  # included if possible.
  
  observation_ancillary <- ecocomDP::create_observation_ancillary(
    L0_flat = flat,
    observation_id = "observation_id", 
    variable_name = c("trap.type", "trap.num", "moose.cage"))
  
  location_ancillary <- ecocomDP::create_location_ancillary(
    L0_flat = flat,
    location_id = "location_id",
    variable_name = "treatment")
  
  taxon_ancillary <- ecocomDP::create_taxon_ancillary(
    L0_flat = flat,
    taxon_id = "taxon_id",
    variable_name = c(
      "subfamily", "hl", "rel", "rll", "colony.size", 
      "feeding.preference", "nest.substrate", "primary.habitat", 
      "secondary.habitat", "seed.disperser", "slavemaker.sp", 
      "behavior", "biogeographic.affinity", "source"),
    unit = c("unit_hl", "unit_rel", "unit_rll"))
  
  # Create the variable_mapping table. This is optional but highly recommended 
  # as it provides unambiguous definitions to variables and facilitates 
  # integration with other ecocomDP datasets.
  
  variable_mapping <- ecocomDP::create_variable_mapping(
    observation = observation,
    observation_ancillary = observation_ancillary,
    location_ancillary = location_ancillary, 
    taxon_ancillary = taxon_ancillary)
  
  # Each entry below is: variable_name, mapped_system, mapped_id, mapped_label. 
  # Variables not listed here are left as returned by 
  # create_variable_mapping().
  
  dwc <- "Darwin Core"
  ecso <- "The Ecosystem Ontology"
  
  mappings <- list(
    c("abundance", dwc,
      "http://rs.tdwg.org/dwc/terms/individualCount", "individualCount"),
    c("treatment", ecso,
      "http://purl.dataone.org/odo/ECSO_00000506", "Manipulative experiment"),
    c("trap.type", ecso,
      "http://purl.dataone.org/odo/ECSO_00001591", "type of trap"),
    c("hl", dwc,
      "http://rs.tdwg.org/dwc/terms/measurementType", "measurementType"),
    c("rel", dwc,
      "http://rs.tdwg.org/dwc/terms/measurementType", "measurementType"),
    c("rll", dwc,
      "http://rs.tdwg.org/dwc/terms/measurementType", "measurementType"),
    c("colony.size", ecso,
      "http://purl.dataone.org/odo/ECSO_00000311", "Population"),
    c("feeding.preference", dwc,
      "http://rs.tdwg.org/dwc/terms/behavior", "behavior"),
    c("primary.habitat", ecso,
      "http://purl.dataone.org/odo/ECSO_00002736", "type of habitat"),
    c("secondary.habitat", ecso,
      "http://purl.dataone.org/odo/ECSO_00002736", "type of habitat"),
    c("seed.disperser", dwc,
      "http://rs.tdwg.org/dwc/terms/behavior", "behavior"),
    c("slavemaker.sp", dwc,
      "http://rs.tdwg.org/dwc/terms/behavior", "behavior"),
    c("behavior", dwc,
      "http://rs.tdwg.org/dwc/terms/behavior", "behavior"),
    c("biogeographic.affinity", ecso,
      "http://purl.dataone.org/odo/ECSO_00002736", "type of habitat"),
    c("source", dwc,
      "http://purl.org/dc/terms/references", "references"))
  
  for (m in mappings) {
    i <- variable_mapping$variable_name == m[[1]]
    variable_mapping$mapped_system[i] <- m[[2]]
    variable_mapping$mapped_id[i] <- m[[3]]
    variable_mapping$mapped_label[i] <- m[[4]]
  }
  
  # Write tables to file
  
  ecocomDP::write_tables(
    path = path, 
    observation = observation, 
    location = location,
    taxon = taxon,
    dataset_summary = dataset_summary, 
    observation_ancillary = observation_ancillary,
    location_ancillary = location_ancillary, 
    taxon_ancillary = taxon_ancillary, 
    variable_mapping = variable_mapping)
  
  # Validate tables -----------------------------------------------------------
  
  # Validation checks ensure the derived set of tables comply with the ecocomDP
  # model. Any issues found at this point should be addressed in the lines of 
  # code above, the tables rewritten, and validation run again to be certain 
  # the fix worked. The result is kept in "issues" for inspection while 
  # debugging; it is not used further by this function.
  
  issues <- ecocomDP::validate_data(path = path)
  
  # Create metadata -----------------------------------------------------------
  
  # Before publishing the derived ecocomDP dataset, we need to describe it. The 
  # create_eml() function does this all for us. It knows the structure of the 
  # ecocomDP model and applies standardized table descriptions and mixes in 
  # important elements of the source dataset metadata for purposes of 
  # communication and provenance tracking.
  
  # Convert "dataset level keywords" listed in the source to "dataset level 
  # annotations" in the derived. The predicate "is about" is used, which 
  # results in an annotation that reads "This dataset is about 'species 
  # abundance'", "This dataset is about an ecological 'Community'", etc. All 
  # source datasets involving a human induced manipulative experiment, not a 
  # natural disturbance/experiment, should include the "Manipulative 
  # experiment" annotation below to enable searching on this term.
  
  dataset_annotations <- c(
    `species abundance` = 
      "http://purl.dataone.org/odo/ECSO_00001688",
    Community = 
      "http://purl.dataone.org/odo/ECSO_00000310",
    `Manipulative experiment` = 
      "http://purl.dataone.org/odo/ECSO_00000506",
    `level of ecological disturbance` = 
      "http://purl.dataone.org/odo/ECSO_00002588",
    `type of ecological disturbance` = 
      "http://purl.dataone.org/odo/ECSO_00002589")
  
  # Add contact information for the author of this script and dataset
  
  additional_contact <- data.frame(
    givenName = 'Colin',
    surName = 'Smith',
    organizationName = 'Environmental Data Initiative',
    electronicMailAddress = 'ecocomdp@gmail.com',
    stringsAsFactors = FALSE)
  
  # Create EML metadata. The "url" argument is forwarded so the automated 
  # publication routine can tell the repository where to download the tables 
  # and metadata from; it stays NULL when publishing manually.
  
  derived_eml <- ecocomDP::create_eml(
    path = path,
    source_id = source_id,
    derived_id = derived_id,
    is_about = dataset_annotations,
    script = "create_ecocomDP.R",
    script_description = 
      "A function for converting knb-lter-hfr.118 to ecocomDP",
    contact = additional_contact,
    user_id = 'ecocomdp',
    user_domain = 'EDI',
    basis_of_record = "HumanObservation",
    url = url)
  
  invisible(derived_eml)
  
}

Example function call:


# Make a scratch directory to hold the derived tables and metadata

mypath <- file.path(tempdir(), "edi_193")
dir.create(mypath)

# Convert source dataset "knb-lter-hfr.118.33" into ecocomDP dataset "edi.193.5"

create_ecocomDP(
  path = mypath,
  source_id = "knb-lter-hfr.118.33",
  derived_id = "edi.193.5"
)
#> Retrieving EML for data package knb-lter-hfr.118.33
#>  [0%] Downloaded 0 bytes...
#>  [0%] Downloaded 0 bytes...
#>
#> Searching ITIS for "Aphaenogaster picea"
#> Searching ITIS for "Camponotus novaeboracensis"
#> Searching ITIS for "Aphaenogaster fulva"
#> Searching ITIS for "Temnothorax longispinosus"
#> Searching ITIS for "Stenemma impar"
#> Searching ITIS for "Stenemma diecki"
#> Searching ITIS for "Camponotus pennsylvanicus"
#> Searching ITIS for "Lasius americanus"
#> Searching ITIS for "Myrmica punctiventris"
#> Searching ITIS for "Lasius nearcticus"
#> Searching ITIS for "Formica subaenescens"
#> Searching ITIS for "Lasius umbratus"
#> Searching ITIS for "Formica subsericea"
#> Searching ITIS for "Formica aserva"
#> Searching ITIS for "Formica neogagates"
#> Searching ITIS for "Camponotus nearcticus"
#> Searching ITIS for "Ponera pennsylvanica"
#> Searching ITIS for "Stenamma brevicorne"
#> Searching ITIS for "Lasius claviger"
#> Searching ITIS for "Stenamma impar"
#> Searching ITIS for "Temnothorax lognispinosus"
#> Searching ITIS for "Stenamma diecki"
#> Searching ITIS for "Camponotus herculeanus"
#> Searching ITIS for "Lasius speculiventris"
#> Searching ITIS for "Stenamma schmitti"
#> Searching ITIS for "Lasius neoniger"
#> Searching ITIS for "Camponotus pennsylvanica"
#> Searching ITIS for "Tapinoma sessile"
#> Searching ITIS for "Myrmica AF-smi"
#> Searching ITIS for "Formica neorufibarbis"
#> Searching ITIS for "Myrmica incompleta"
#> Searching ITIS for "Formica argentea"
#> Searching ITIS for "Myrmica AF-scu"
#> Searching ITIS for "Formica dolosa"
#> Searching ITIS for "Formica subintegra"
#> Searching ITIS for "Formica incerta"
#> Searching ITIS for "Myrmica nearctica"
#> Searching ITIS for "Formica pergandei"
#> Searching ITIS for "Formica lasioides"
#> Searching ITIS for "Myrmica pinetorum"
#> Searching ITIS for "Leptothorax canadensis"
#> Searching ITIS for "Myrmica detritinodis"
#> Searching ITIS for "Myrmecina americana"
#> Searching ITIS for "Crematogaster lineolata"
#> Searching ITIS for "Lasius interjectus"
#> Searching ITIS for "Camponotus chromaiodes"
#> Searching ITIS for "Formica pallidefulva"
#> Searching ITIS for "Temnothorax ambiguus"
#> Searching ITIS for "Lasius subglaber"
#> Searching ITIS for "Formica rubicunda"
#> Searching ITIS for "Lasius brevicornis"
#> Searching ITIS for "Lasius aphidicolus"
#> Searching ITIS for "Formica integra"
#>
#> Writing tables to file:
#>   observation
#>   location
#>   taxon
#>   dataset_summary
#>   observation_ancillary
#>   location_ancillary
#>   taxon_ancillary
#>   variable_mapping
#>
#> Validating edi_193:
#>   Required tables
#>   Column names
#>   Required columns
#>   Column classes
#>   Datetime formats
#>   Primary keys
#>   Composite keys
#>   Referential integrity
#>   Latitude and longitude format
#>   Latitude and longitude range
#>   Elevation
#>   variable_mapping
#>
#> Creating EML for derived data package (edi.193.5)
#> Reading EML of L0 data package knb-lter-hfr.118.33
#> Creating EML of L1 data package edi.193.5
#> Updating:
#> <eml>
#>   <dataset>
#>     <alternateIdentifier>
#>     <title>
#>     <pubDate>
#>     <keywordSet>
#>     <contact>
#>     <methods>
#>     <dataTable>
#>     <otherEntity>
#>     <annotations>
#> </eml>
#> Writing EML
#> Validating EML
#>   Validation passed :)
#> Done.

# The output directory now holds a valid set of ecocomDP tables and metadata,
# ready for upload to EDI (or any other EML based repository)
list.files(mypath)
#>  [1] "create_ecocomDP.R"        
#>  [2] "dataset_summary.csv"      
#>  [3] "edi.193.5.xml"            
#>  [4] "location.csv"             
#>  [5] "location_ancillary.csv"   
#>  [6] "observation.csv"          
#>  [7] "observation_ancillary.csv"
#>  [8] "taxon.csv"                
#>  [9] "taxon_ancillary.csv"      
#> [10] "variable_mapping.csv"