Examples

1. Detecting ambiguity and inconsistencies

The first example illustrates how to detect ambiguity and inconsistency in a merged taxonomy. Start by loading the 2000-row sample dataset that comes with taxonbridge:

library(taxonbridge)
sample <- load_sample()
dim(sample)
#> [1] 2000   29

Next, retrieve all rows that have lineage information in both the GBIF backbone and NCBI:

lineages <- get_lineages(sample)

Then validate the lineages by using the kingdom and family taxonomic ranks, and create a list of the resulting tibble(s). Note that phylum, class, and order may also be used. In this example, entries that failed validation are returned by setting valid = FALSE.

kingdom <- get_validity(lineages, rank = "kingdom", valid = FALSE)
#> Term conversion carried out on kingdom taxonomic rank
family <- get_validity(lineages, rank = "family", valid = FALSE)
candidates <- list(kingdom, family)

Finally, detect candidate incongruencies (excluding those with uninomial scientific names):

get_inconsistencies(candidates, uninomials = FALSE)
#> [1] "Gordonia neofelifaecis"  "Attheya septentrionalis"

Two binomial names exhibit incongruency. Upon reference to the literature and the individual entries it can be seen that:

Attheya septentrionalis is assigned to different families of the problematica order Chaetocerotales
Gordonia neofelifaecis is a plant (family: Theaceae) in the GBIF but a bacterium in the NCBI (family: Gordoniaceae)

Attheya septentrionalis has the status “synonym” in the GBIF data:

lineages[lineages$canonicalName=="Attheya septentrionalis", "taxonomicStatus"]
#> # A tibble: 1 × 1
#>   taxonomicStatus
#>   <chr>          
#> 1 synonym

Applying the get_status() function and rerunning the exercise leaves only Gordonia neofelifaecis as a binomial incongruency with biological provenance:

lineages <- get_status(get_lineages(sample), status = "accepted")
kingdom <- get_validity(lineages, rank = "kingdom", valid = FALSE)
#> Term conversion carried out on kingdom taxonomic rank
family <- get_validity(lineages, rank = "family", valid = FALSE)
candidates <- list(kingdom, family)
get_inconsistencies(candidates, uninomials = FALSE)
#> [1] "Gordonia neofelifaecis"

2. Annotating a custom taxonomy

Again, start by loading the 2000-row sample dataset that comes with taxonbridge. Then apply the get_taxa() function to find all decapod crustaceans in the sample dataset:

library(taxonbridge)
sample <- load_sample()
decapoda <- get_taxa(sample, order = "decapoda")

The decapoda object will serve as your base taxonomy. Create a new object that only retains decapods known as swimming crabs:

swimming_crabs <- get_taxa(sample, family = "portunidae")

Next, annotate your base taxonomy with this colloquial name for the family Portunidae:

decapoda <- annotate(decapoda, names = swimming_crabs$canonicalName, 
                     new_column = "swimming_crabs", present = "1")

A new column by the name “swimming_crabs” has been added to the base taxonomy:

colnames(decapoda)
#>  [1] "taxonID"              "canonicalName"        "taxonRank"           
#>  [4] "parentNameUsageID"    "acceptedNameUsageID"  "originalNameUsageID" 
#>  [7] "taxonomicStatus"      "kingdom"              "phylum"              
#> [10] "class"                "order"                "family"              
#> [13] "genericName"          "specificEpithet"      "infraspecificEpithet"
#> [16] "from_GBIF"            "ncbi_id"              "ncbi_lineage_names"  
#> [19] "ncbi_lineage_ids"     "ncbi_rank"            "ncbi_lineage_ranks"  
#> [22] "ncbi_kingdom"         "ncbi_phylum"          "ncbi_class"          
#> [25] "ncbi_order"           "ncbi_family"          "ncbi_genus"          
#> [28] "ncbi_species"         "from_NCBI"            "swimming_crabs"

Since only the present parameter and not the absent parameter was passed to annotate(), all species that are not members of the Portunidae will be assigned NA by default in the swimming_crabs column. Swimming crabs can therefore be retrieved from the base taxonomy with the following command:

decapoda[!is.na(decapoda$swimming_crabs),"canonicalName"]
#> # A tibble: 1 × 1
#>   canonicalName      
#>   <chr>              
#> 1 Callinectes sapidus

3. Visualizing a custom taxonomy

Continue using the annotated base taxonomy from example 2. Prepare two distributions for the base taxonomy using prepare_rank_dist():

GBIF_dist <- prepare_rank_dist(decapoda, GBIF = TRUE)
NCBI_dist <- prepare_rank_dist(decapoda, NCBI = TRUE)
plot_mdb(GBIF_dist)
plot_mdb(NCBI_dist)

The plots show that there is a difference between the entries of the NCBI and GBIF. Looking at the previously prepared distributions reveals that the NCBI lacks lineage data for two species:

GBIF_dist
#> $GBIF
#> # A tibble: 3 × 2
#>   Rank    Frequency
#>   <chr>       <int>
#> 1 family          1
#> 2 genus           4
#> 3 species        13
#> 
#> attr(,"class")
#> [1] "one_rank"
NCBI_dist 
#> $NCBI
#> # A tibble: 4 × 2
#>   Rank    Frequency
#>   <chr>       <int>
#> 1 family          1
#> 2 genus           4
#> 3 species        11
#> 4 <NA>            2
#> 
#> attr(,"class")
#> [1] "one_rank"

Ensuring that both the NCBI data and GBIF data have lineage data by using get_lineages() solves this problem at the cost of losing two GBIF entries that are not available in the NCBI:

lineages <- get_lineages(decapoda)
GBIF_dist <- prepare_rank_dist(lineages, GBIF = TRUE)
NCBI_dist <- prepare_rank_dist(lineages, NCBI = TRUE)
plot_mdb(GBIF_dist)
plot_mdb(NCBI_dist)

Note that get_lineages() should be used with care since extinct species in the GBIF are unlikely to have lineage data in the NCBI.

Now that both the GBIF and NCBI data have lineage information, the validity of the lineage information can be assessed in the same way as was done in example 1:

get_validity(lineages, valid = FALSE)
#> # A tibble: 3 × 30
#>   taxonID  canonicalName         taxonRank parentNameUsageID acceptedNameUsageID
#>   <chr>    <chr>                 <chr>     <chr>             <chr>              
#> 1 11343777 Orithyia              genus     6316              2117279            
#> 2 2221665  Palibythus magnificus species   2221664           <NA>               
#> 3 2226015  Lysirude              genus     6927319           <NA>               
#> # … with 25 more variables: originalNameUsageID <chr>, taxonomicStatus <chr>,
#> #   kingdom <chr>, phylum <chr>, class <chr>, order <chr>, family <chr>,
#> #   genericName <chr>, specificEpithet <chr>, infraspecificEpithet <chr>,
#> #   from_GBIF <chr>, ncbi_id <chr>, ncbi_lineage_names <chr>,
#> #   ncbi_lineage_ids <chr>, ncbi_rank <chr>, ncbi_lineage_ranks <chr>,
#> #   ncbi_kingdom <chr>, ncbi_phylum <chr>, ncbi_class <chr>, ncbi_order <chr>,
#> #   ncbi_family <chr>, ncbi_genus <chr>, ncbi_species <chr>, from_NCBI <chr>, …

Three entries have invalid data:

The genus Orithyia belongs to the family Phoxichilidiidae in the GBIF but belongs to the family Orithyiidae in the NCBI.
The species Palibythus magnificus belongs to the family Palinuridae in the GBIF but belongs to the family Synaxidae in the NCBI.
The genus Lysirude belongs to the family Lyreididae in the GBIF but belongs to the family Raninidae in the NCBI.

Annotating the base taxonomy with these inconsistencies is a good idea:

decapoda <- annotate(decapoda, get_validity(lineages, valid = FALSE)$canonicalName,
                    new_column = "family_inconsistencies", present = 1)
#> 3 annotations were made.
colnames(decapoda)
#>  [1] "taxonID"                "canonicalName"          "taxonRank"             
#>  [4] "parentNameUsageID"      "acceptedNameUsageID"    "originalNameUsageID"   
#>  [7] "taxonomicStatus"        "kingdom"                "phylum"                
#> [10] "class"                  "order"                  "family"                
#> [13] "genericName"            "specificEpithet"        "infraspecificEpithet"  
#> [16] "from_GBIF"              "ncbi_id"                "ncbi_lineage_names"    
#> [19] "ncbi_lineage_ids"       "ncbi_rank"              "ncbi_lineage_ranks"    
#> [22] "ncbi_kingdom"           "ncbi_phylum"            "ncbi_class"            
#> [25] "ncbi_order"             "ncbi_family"            "ncbi_genus"            
#> [28] "ncbi_species"           "from_NCBI"              "swimming_crabs"        
#> [31] "family_inconsistencies"
decapoda[!is.na(decapoda$family_inconsistencies),"canonicalName"]
#> # A tibble: 3 × 1
#>   canonicalName        
#>   <chr>                
#> 1 Orithyia             
#> 2 Palibythus magnificus
#> 3 Lysirude

4. Retrieving information for a list of non-specific scientific names

In this example a researcher is in possession of a list of scientific names of cone snails. The researcher has a three-fold concern. Firstly, some names in the list are non-specific since the researcher would like to include all species in a genus (e.g., “genus sp”). Secondly, the researcher is aware that there are changes in the taxonomic status of cone snails from time to time and would like to detect new or changed taxonomic entries. Lastly, the researcher would like to retrieve the associated GBIF and NCBI identifiers for all entries as well as the taxonomic status of every entry.

NOTE: The following script requires Taxonkit and may take some time to complete execution.

library(taxonbridge)

#Retrieve and merge NCBI and GBIF data. INSERT PATH TO YOUR TAXONKIT INSTALLATION.
custom_taxonomy <- load_taxonomies(download_gbif(), 
                                   download_ncbi(taxonkitpath = "/path/to/taxonkit"))

#Create a custom taxonomy of all gastropods
custom_taxonomy <- get_taxa(custom_taxonomy, class = "gastropoda")

#Use fuzzy_search to find occurrences of the names within the custom taxonomy
search_result <- c()
sp_names <- c("Natica sp", "Conus sp")
for (i in sp_names) {
  iter <- fuzzy_search(custom_taxonomy, i, allow_term_removal = TRUE)
  search_result <- c(search_result, iter)
}
exact_names <- c("Polinices mammillata", "Cymatium pileare",
                 "Chicoreus ramosus", "Murex tenuirostris","Vasum turbinellum",
                 "Oliva amethystina", "Mitra mitra", "Nassa serta",
                 "Phos senticosus")
for (i in exact_names) {
  iter <- fuzzy_search(custom_taxonomy, i, allow_term_removal = FALSE)
  search_result <- c(search_result, iter)
}

#Annotate the custom taxonomy
custom_taxonomy <- annotate(custom_taxonomy, names = search_result, new_column = "cone_snails")

#Filter on the annotation
custom_taxonomy <- custom_taxonomy[!is.na(custom_taxonomy$cone_snails),]

#De-duplicate the custom taxonomy
custom_taxonomy <- dedupe(custom_taxonomy, ranked = TRUE)

#Create a subset of relevant data
custom_taxonomy_short <- custom_taxonomy[, c(1,17,2,3,20,7)]
colnames(custom_taxonomy_short) <- c("GBIF_id", "NCBI_id", "species_name", 
                                     "GBIF_rank", "NCBI_rank", "taxonomic_status")

#Print the results to the terminal
print(custom_taxonomy_short, n=100)

The above script will return a tibble of candidate cone snails with their associated NCBI and GBIF identifiers as well as their associated taxonomic statuses. Since the fuzzy search method was used, each entry should be manually assessed to rule out false positives.

Examples

Table of contents

1. Detecting ambiguity and inconsistencies

2. Annotating a custom taxonomy

3. Visualizing a custom taxonomy

4. Retrieving information for a list of non-specific scientific names