library(RScelestial)
# We load igraph for drawing trees. If you do not want to draw,
# there is no need to import igraph.
library(igraph)
#>
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:stats':
#>
#> decompose, spectrum
#> The following object is masked from 'package:base':
#>
#> union
The RScelestial package could be installed easily as follows
install.packages("RScelestial")
Here we show a simulation. We build a data set with following command.
# Following command generates ten samples with 20 loci.
# Rate of mutations on each edge of the evolutionary tree is 1.5.
= synthesis(10, 20, 5, seed = 7)
D
D#> $seqeunce
#> C1 C2 C3 C4 C5 C6 C7 C8 C9 C10
#> L1 3 1 3 0 0 1 3 1 3 3
#> L2 1 1 3 0 0 0 3 3 0 3
#> L3 1 0 3 0 3 3 0 3 3 0
#> L4 1 3 3 3 3 3 0 1 0 3
#> L5 1 0 1 0 3 0 3 0 0 1
#> L6 0 3 3 3 3 3 3 3 3 3
#> L7 0 3 0 3 0 3 3 1 0 3
#> L8 3 3 0 1 1 3 3 3 3 1
#> L9 1 0 3 0 3 1 3 3 0 1
#> L10 3 3 0 0 0 1 3 1 0 0
#> L11 3 0 3 3 0 3 0 3 3 0
#> L12 1 0 0 3 3 0 0 0 3 1
#> L13 3 3 1 3 0 3 0 0 1 3
#> L14 0 3 3 3 3 3 3 3 0 3
#> L15 3 1 0 1 0 0 0 0 0 0
#> L16 3 0 0 0 3 3 0 0 1 0
#> L17 0 3 3 3 3 0 3 3 0 3
#> L18 3 3 3 3 3 3 1 3 3 3
#> L19 1 0 3 1 0 3 3 0 3 3
#> L20 0 0 3 0 3 3 3 3 0 0
#>
#> $true.sequence
#> C1 C2 C3 C4 C5 C6 C7 C8 C9 C10
#> L1 0 0 0 0 0 0 0 0 0 0
#> L2 0 0 0 0 0 0 0 0 0 0
#> L3 0 0 0 0 0 0 0 0 0 0
#> L4 0 0 0 0 0 0 0 0 0 0
#> L5 1 0 1 0 0 0 0 0 0 1
#> L6 0 0 0 0 0 0 0 0 0 0
#> L7 0 0 0 0 0 0 0 0 0 0
#> L8 0 0 0 0 0 0 0 0 1 0
#> L9 1 0 1 0 0 0 0 0 0 1
#> L10 0 0 0 0 0 0 0 0 0 0
#> L11 0 0 0 0 0 0 0 0 0 0
#> L12 0 0 0 0 0 0 0 0 0 0
#> L13 0 0 0 0 0 0 0 0 1 0
#> L14 0 0 0 0 0 0 0 0 1 0
#> L15 0 0 0 0 0 0 0 0 0 0
#> L16 0 0 0 0 0 0 0 0 1 0
#> L17 0 0 0 0 0 0 0 0 0 0
#> L18 0 0 0 0 0 0 0 0 0 0
#> L19 0 0 0 0 0 0 0 0 0 0
#> L20 0 0 0 0 0 0 0 0 0 0
#>
#> $true.clone
#> $true.clone$N1
#> [1] "C6" "C7"
#>
#> $true.clone$N2
#> [1] "C2" "C5" "C8"
#>
#> $true.clone$N3
#> [1] "C4"
#>
#> $true.clone$N5
#> [1] "C1" "C3" "C10"
#>
#> $true.clone$N6
#> [1] "C9"
#>
#>
#> $true.tree
#> src dest len
#> 1 N2 N1 1
#> 2 N3 N1 1
#> 3 N5 N1 1
#> 4 N6 N1 2
= as.ten.state.matrix(D$seqeunce)
seq = scelestial(seq, return.graph = TRUE)
SP
SP#> $input
#> V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#> C1 ./. C/C C/C C/C C/C A/A A/A ./. C/C ./. ./. C/C ./. A/A ./. ./. A/A ./. C/C
#> C10 ./. ./. A/A ./. C/C ./. ./. C/C C/C A/A A/A C/C ./. ./. A/A A/A ./. ./. ./.
#> C2 C/C C/C A/A ./. A/A ./. ./. ./. A/A ./. A/A A/A ./. ./. C/C A/A ./. ./. A/A
#> C3 ./. ./. ./. ./. C/C ./. A/A A/A ./. A/A ./. A/A C/C ./. A/A A/A ./. ./. ./.
#> C4 A/A A/A A/A ./. A/A ./. ./. C/C A/A A/A ./. ./. ./. ./. C/C A/A ./. ./. C/C
#> C5 A/A A/A ./. ./. ./. ./. A/A C/C ./. A/A A/A ./. A/A ./. A/A ./. ./. ./. A/A
#> C6 C/C A/A ./. ./. A/A ./. ./. ./. C/C C/C ./. A/A ./. ./. A/A ./. A/A ./. ./.
#> C7 ./. ./. A/A A/A ./. ./. ./. ./. ./. ./. A/A A/A A/A ./. A/A A/A ./. C/C ./.
#> C8 C/C ./. ./. C/C A/A ./. C/C ./. ./. C/C ./. A/A A/A ./. A/A A/A ./. ./. A/A
#> C9 ./. A/A ./. A/A A/A ./. A/A ./. A/A A/A ./. ./. C/C A/A A/A C/C A/A ./. ./.
#> V20
#> C1 A/A
#> C10 A/A
#> C2 A/A
#> C3 ./.
#> C4 A/A
#> C5 ./.
#> C6 ./.
#> C7 ./.
#> C8 ./.
#> C9 A/A
#>
#> $sequence
#> V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
#> C1 A/A C/C C/C C/C C/C A/A A/A C/C C/C A/A A/A C/C A/A A/A A/A A/A A/A A/A C/C
#> C10 A/A A/A A/A A/A C/C A/A A/A C/C C/C A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A
#> C2 C/C C/C A/A C/C A/A A/A C/C C/C A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A
#> C3 A/A A/A A/A A/A C/C A/A A/A A/A C/C A/A A/A A/A C/C A/A A/A A/A A/A A/A A/A
#> C4 A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A A/A A/A C/C A/A C/C A/A A/A A/A C/C
#> C5 A/A A/A A/A A/A C/C A/A A/A C/C C/C A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A
#> C6 C/C A/A A/A C/C A/A A/A C/C A/A C/C C/C A/A A/A A/A A/A A/A A/A A/A A/A A/A
#> C7 C/C A/A A/A A/A A/A A/A C/C C/C C/C C/C A/A A/A A/A A/A A/A A/A A/A C/C A/A
#> C8 C/C A/A A/A C/C A/A A/A C/C A/A A/A C/C A/A A/A A/A A/A A/A A/A A/A A/A A/A
#> C9 A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A A/A C/C A/A A/A C/C A/A A/A A/A
#> V20
#> C1 A/A
#> C10 A/A
#> C2 A/A
#> C3 A/A
#> C4 A/A
#> C5 A/A
#> C6 A/A
#> C7 A/A
#> C8 A/A
#> C9 A/A
#>
#> $tree
#> src dest len
#> 1 C6 C8 5.75015
#> 2 C2 C4 6.75011
#> 3 C2 C8 6.75014
#> 4 C7 C8 6.75015
#> 5 C7 C10 6.75015
#> 6 C5 C10 6.75016
#> 7 C3 C10 7.00014
#> 8 C4 C9 7.50013
#> 9 C1 C10 7.75015
#>
#> $graph
#> IGRAPH 4f0a8e2 UNW- 10 9 --
#> + attr: name (v/c), weight (e/n)
#> + edges from 4f0a8e2 (vertex names):
#> [1] C6 --C8 C2 --C4 C8 --C2 C8 --C7 C7 --C10 C10--C5 C10--C3 C4 --C9
#> [9] C10--C1
You can draw the graph with following command
tree.plot(SP, vertex.size = 30)
Also, we can make a rooted tree with cell “C8” as the root of the tree as follows:
= scelestial(seq, root.assign.method = "fix", root = "C8", return.graph = TRUE)
SP #> [1] "C8 -1"
#> [1] "C6 C8"
#> [1] "C2 C8"
#> [1] "C4 C2"
#> [1] "C9 C4"
#> [1] "C7 C8"
#> [1] "C10 C7"
#> [1] "C5 C10"
#> [1] "C3 C10"
#> [1] "C1 C10"
tree.plot(SP, vertex.size = 30)
Setting root.assign.method to “balance” lets the algorithm decide for a root that produces minimum height tree.
= scelestial(seq, root.assign.method = "balance", return.graph = TRUE)
SP #> [1] "C1 -1"
#> [1] "C10 C1"
#> [1] "C7 C10"
#> [1] "C8 C7"
#> [1] "C6 C8"
#> [1] "C2 C8"
#> [1] "C4 C2"
#> [1] "C9 C4"
#> [1] "C5 C10"
#> [1] "C3 C10"
#> [1] "C8 -1"
#> [1] "C6 C8"
#> [1] "C2 C8"
#> [1] "C4 C2"
#> [1] "C9 C4"
#> [1] "C7 C8"
#> [1] "C10 C7"
#> [1] "C5 C10"
#> [1] "C3 C10"
#> [1] "C1 C10"
tree.plot(SP, vertex.size = 30)
Following command calculates the distance array between pairs of samples.
<- distance.matrix.true.tree(D)
D.distance.matrix
D.distance.matrix#> C6 C7 C2 C5 C8 C4
#> C6 0.000000000 0.000000000 0.007246377 0.007246377 0.007246377 0.007246377
#> C7 0.000000000 0.000000000 0.007246377 0.007246377 0.007246377 0.007246377
#> C2 0.007246377 0.007246377 0.000000000 0.000000000 0.000000000 0.014492754
#> C5 0.007246377 0.007246377 0.000000000 0.000000000 0.000000000 0.014492754
#> C8 0.007246377 0.007246377 0.000000000 0.000000000 0.000000000 0.014492754
#> C4 0.007246377 0.007246377 0.014492754 0.014492754 0.014492754 0.000000000
#> C1 0.007246377 0.007246377 0.014492754 0.014492754 0.014492754 0.014492754
#> C3 0.007246377 0.007246377 0.014492754 0.014492754 0.014492754 0.014492754
#> C10 0.007246377 0.007246377 0.014492754 0.014492754 0.014492754 0.014492754
#> C9 0.014492754 0.014492754 0.021739130 0.021739130 0.021739130 0.021739130
#> C1 C3 C10 C9
#> C6 0.007246377 0.007246377 0.007246377 0.01449275
#> C7 0.007246377 0.007246377 0.007246377 0.01449275
#> C2 0.014492754 0.014492754 0.014492754 0.02173913
#> C5 0.014492754 0.014492754 0.014492754 0.02173913
#> C8 0.014492754 0.014492754 0.014492754 0.02173913
#> C4 0.014492754 0.014492754 0.014492754 0.02173913
#> C1 0.000000000 0.000000000 0.000000000 0.02173913
#> C3 0.000000000 0.000000000 0.000000000 0.02173913
#> C10 0.000000000 0.000000000 0.000000000 0.02173913
#> C9 0.021739130 0.021739130 0.021739130 0.00000000
<- distance.matrix.scelestial(SP)
SP.distance.matrix
SP.distance.matrix#> C1 C10 C2 C3 C4 C5
#> C1 0.000000000 0.004338085 0.015673107 0.008256357 0.019451428 0.008116433
#> C10 0.004338085 0.000000000 0.011335023 0.003918273 0.015113343 0.003778348
#> C2 0.015673107 0.011335023 0.000000000 0.015253295 0.003778320 0.015113371
#> C3 0.008256357 0.003918273 0.015253295 0.000000000 0.019031616 0.007696621
#> C4 0.019451428 0.015113343 0.003778320 0.019031616 0.000000000 0.018891691
#> C5 0.008116433 0.003778348 0.015113371 0.007696621 0.018891691 0.000000000
#> C6 0.015113371 0.010775286 0.006996938 0.014693559 0.010775258 0.014553634
#> C7 0.008116428 0.003778343 0.007556680 0.007696615 0.011335000 0.007556691
#> C8 0.011894770 0.007556685 0.003778337 0.011474958 0.007556657 0.011335034
#> C9 0.023649566 0.019311481 0.007976458 0.023229754 0.004198138 0.023089829
#> C6 C7 C8 C9
#> C1 0.015113371 0.008116428 0.011894770 0.023649566
#> C10 0.010775286 0.003778343 0.007556685 0.019311481
#> C2 0.006996938 0.007556680 0.003778337 0.007976458
#> C3 0.014693559 0.007696615 0.011474958 0.023229754
#> C4 0.010775258 0.011335000 0.007556657 0.004198138
#> C5 0.014553634 0.007556691 0.011335034 0.023089829
#> C6 0.000000000 0.006996943 0.003218601 0.014973396
#> C7 0.006996943 0.000000000 0.003778343 0.015533138
#> C8 0.003218601 0.003778343 0.000000000 0.011754796
#> C9 0.014973396 0.015533138 0.011754796 0.000000000
## Difference between normalized distance matrices
<- rownames(SP.distance.matrix)
vertices sum(abs(D.distance.matrix[vertices,vertices] - SP.distance.matrix))
#> [1] 0.4487036
Given a multiple sequence alignment, Scelestial infers the phylogeny of them. Here we present a simple example. First we load libraries to load a multiple alignment.
library(stringr)
library(seqinr)
In this example, we load a multiple alignment from seqinr package.
data(phylip, package = "seqinr")
Then we clean the data and build a zero-one matrix representing taxa and characters. Note that Scelestial accept matrices with taxa as its columns and characters as its rows.
# Removing non-informative columns and duplicate rows.
<- toupper(t(sapply(seq(phylip$seq), function(i) unlist(strsplit(phylip$seq[[i]], '')))))
mcb <- as.character(phylip$seq)
ccb <- order(ccb)
occb <- sapply(seq(ncol(mcb)), function(j) length(levels(as.factor(mcb[,j]))) == 1)
cbColMask <- rep(TRUE, length(ccb))
cbRowMask for (i in seq(length(ccb))) {
if (i == 1 || ccb[occb[i]] != ccb[occb[i-1]]) {
<- FALSE
cbRowMask[occb[i]]
}
}<- apply(mcb[!cbRowMask, !cbColMask], MARGIN = 1, FUN = function(a) paste0(str_replace(a, "-", "X"), collapse = "")) mcbRows
Executing Scelestial on the input matrix.
<- data.frame(nodes = phylip$nam[!cbRowMask], seq = mcbRows)
n.seq <- data.frame(t(as.ten.state.matrix.from.node.seq(n.seq)), stringsAsFactors = TRUE)
seq2 # Running Scelestial
= scelestial(seq2, return.graph = TRUE) SP
tree.plot(SP, vertex.size=20, vertex.label.dist=0, asp = 0, vertex.label.cex = 1)