This tutorial shows how to use RVenn
, a package for dealing with multiple sets. The base R functions (intersect
, union
and setdiff
) only work with two sets. %>%
can be used from magrittr
but for many sets this can be tedious. The reduce
function from the purrr
package also provides a solution, and it is the function used for set operations in this package. The functions overlap
, unite
and discern
abstract away the details, so one can just construct the universe and choose the sets to operate on by index or set name. Further, by using ggvenn
, a Venn diagram can be drawn for 2-3 sets. As the name of the function suggests, ggvenn
is based on ggplot2
, so it is a neat way to show the relationships among a small number of sets. For many sets, it is much better to use an UpSet plot or the setmap
function provided within this package. Finally, by using the enrichment_test
function, the p-value of an overlap between two sets can be calculated. Here, the usage of all these functions will be shown.
This chunk of code will create 10 sets with sizes ranging from 5 to 25.
# Fix the RNG state so the sets printed below are reproducible.
set.seed(42)

# Draw ten set sizes between 5 and 25 (with replacement), then build each set
# by sampling that many distinct lowercase letters.
toy <- map(
  sample(5:25, size = 10, replace = TRUE),
  function(n_letters) sample(letters, size = n_letters)
)

toy[1:3] # First 3 of the sets.
#> [[1]]
#> [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p"
#>
#> [[2]]
#> [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k"
#>
#> [[3]]
#> [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u"
Intersection of all sets:
Intersection of selected sets (chosen with set names or indices, respectively):
# Intersect every pair drawn from the first four sets. The result is a list
# with one element per pair, named like Set_1...Set_2.
overlap_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#> [1] "l" "r" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d" "y"
#> [18] "q" "v" "e" "g" "p"
#>
#> $Set_1...Set_3
#> [1] "l" "w" "u" "o" "m" "x" "d" "q" "v" "e" "g"
#>
#> $Set_1...Set_4
#> [1] "l" "w" "f" "k" "t" "u" "c" "i" "o" "s" "n" "a" "x" "d" "y" "q" "e"
#> [18] "g" "b" "p"
#>
#> $Set_2...Set_3
#> [1] "u" "e" "m" "x" "q" "g" "o" "l" "d" "v"
#>
#> $Set_2...Set_4
#> [1] "a" "u" "z" "e" "t" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d" "n"
#> [18] "f" "c" "k"
#>
#> $Set_3...Set_4
#> [1] "g" "q" "w" "x" "l" "d" "e" "o" "u"
Union of all sets:
# Union of all the sets held in `toy`.
unite(toy)
#> [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"
Union of selected sets (chosen with set names or indices, respectively):
# Union of every pair drawn from the first four sets. The result is a list
# with one element per pair, named like Set_1...Set_2.
unite_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#> [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"
#>
#> $Set_1...Set_3
#> [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p"
#>
#> $Set_1...Set_4
#> [1] "l" "r" "w" "f" "k" "t" "u" "c" "i" "j" "o" "s" "n" "m" "a" "x" "d"
#> [18] "y" "q" "v" "e" "g" "b" "p" "z" "h"
#>
#> $Set_2...Set_3
#> [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k" "w"
#>
#> $Set_2...Set_4
#> [1] "a" "u" "z" "e" "t" "m" "h" "i" "x" "q" "g" "o" "y" "s" "l" "p" "d"
#> [18] "j" "n" "f" "r" "v" "c" "k" "b" "w"
#>
#> $Set_3...Set_4
#> [1] "g" "m" "q" "w" "x" "l" "v" "d" "e" "o" "u" "b" "k" "a" "z" "i" "s"
#> [18] "c" "h" "t" "f" "n" "p" "y"
# Pairwise differences among the first four sets. Both directions are
# returned: Set_1...Set_2 holds the elements of set 1 that are not in set 2,
# and Set_2...Set_1 holds the reverse.
discern_pairs(toy, slice = 1:4)
#> $Set_1...Set_2
#> [1] "w" "b"
#>
#> $Set_1...Set_3
#> [1] "r" "f" "k" "t" "c" "i" "j" "s" "n" "a" "y" "b" "p"
#>
#> $Set_1...Set_4
#> [1] "r" "j" "m" "v"
#>
#> $Set_2...Set_3
#> [1] "a" "z" "t" "h" "i" "y" "s" "p" "j" "n" "f" "r" "c" "k"
#>
#> $Set_2...Set_4
#> [1] "m" "j" "r" "v"
#>
#> $Set_3...Set_4
#> [1] "m" "v"
#>
#> $Set_2...Set_1
#> [1] "z" "h"
#>
#> $Set_3...Set_1
#> character(0)
#>
#> $Set_4...Set_1
#> [1] "z" "h"
#>
#> $Set_3...Set_2
#> [1] "w"
#>
#> $Set_4...Set_2
#> [1] "b" "w"
#>
#> $Set_4...Set_3
#> [1] "b" "k" "a" "z" "i" "s" "c" "h" "t" "f" "n" "p" "y"
For two sets:
For three sets:
Without clustering
# Test whether the overlap between sets 6 and 7 is larger than expected by
# chance, then print the resulting p-value.
er <- enrichment_test(toy, set1 = 6, set2 = 7)
er$Significance
#> [1] 0.4981
# Histogram of the null distribution of overlap counts returned by
# enrichment_test(), with a dashed line marking the observed overlap between
# sets 6 and 7.
#
# ggplot() + aes() is used instead of qplot(), which is deprecated as of
# ggplot2 3.4.0; the explicit form works on both old and new ggplot2 versions.
# Assumes er$Overlap_Counts is a plain numeric vector -- TODO confirm.
null_counts <- data.frame(overlap_count = er$Overlap_Counts)
observed_overlap <- length(overlap(toy, c(6, 7)))

ggplot(null_counts, aes(x = overlap_count)) +
  geom_histogram(fill = "lemonchiffon4", bins = 8, color = "black") +
  # NOTE(review): `size` is kept for compatibility with older ggplot2; on
  # ggplot2 >= 3.4.0 line widths are set with `linewidth` instead.
  geom_vline(
    xintercept = observed_overlap,
    color = "firebrick2",
    size = 2,
    linetype = "dashed",
    alpha = 0.7
  ) +
  ggtitle("Null Distribution") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(name = "Overlap Counts") +
  scale_y_continuous(name = "Frequency")
The test above, of course, is not very meaningful because we created the sets randomly; therefore, we get a high p-value. However, when you are working with actual data, e.g. to check whether a motif is enriched in the promoter regions of the genes in a gene set, you can use this test. In that case, set1
will be the gene set of interest, set2
will be all the genes in the genome in which the motif is found, and univ
will be all the genes of the genome.