Memory usage can be a big obstacle in the use of R/qtl2, particularly regarding the QTL genotype probabilities calculated by calc_genoprob()
. For dense markers in multi-parent populations, these can use gigabytes of RAM.
This led us to develop ways to store the genotype probabilities on disk. In the present package, we rely on the fst package, which includes the option to compress the data.
Let’s first load the R/qtl2 and R/qtl2fst packages.
library(qtl2)
library(qtl2fst)
In this vignette, we’ll give a quick illustration of the R/qtl2fst package using the iron dataset included with R/qtl2. We’ll first load the data.
<- read_cross2(system.file("extdata", "iron.zip", package="qtl2")) iron
Let’s calculate the genotype probabilities and convert them to allele probabilities.
<- calc_genoprob(iron, error_prob=0.002)
pr <- genoprob_to_alleleprob(pr) apr
Use the function fst_genoprob()
to write the probabilities to a fst database. You could do the same thing with the allele probabilities.
<- file.path(tempdir(), "iron_genoprob")
tmpdir dir.create(tmpdir)
<- fst_genoprob(pr, "pr", tmpdir, quiet=TRUE)
fpr <- fst_genoprob(apr, "apr", tmpdir, quiet=TRUE) fapr
The genotype probabilities are saved in a set of files, one per chromosome. There is also an RDS index file, which is a copy of the index object returned by fst_genoprob()
.
list.files(tmpdir)
## [1] "apr_1.fst" "apr_10.fst" "apr_11.fst" "apr_12.fst"
## [5] "apr_13.fst" "apr_14.fst" "apr_15.fst" "apr_16.fst"
## [9] "apr_17.fst" "apr_18.fst" "apr_19.fst" "apr_2.fst"
## [13] "apr_3.fst" "apr_4.fst" "apr_5.fst" "apr_6.fst"
## [17] "apr_7.fst" "apr_8.fst" "apr_9.fst" "apr_X.fst"
## [21] "apr_fstindex.rds" "pr_1.fst" "pr_10.fst" "pr_11.fst"
## [25] "pr_12.fst" "pr_13.fst" "pr_14.fst" "pr_15.fst"
## [29] "pr_16.fst" "pr_17.fst" "pr_18.fst" "pr_19.fst"
## [33] "pr_2.fst" "pr_3.fst" "pr_4.fst" "pr_5.fst"
## [37] "pr_6.fst" "pr_7.fst" "pr_8.fst" "pr_9.fst"
## [41] "pr_X.fst" "pr_fstindex.rds"
You can treat the fpr
and fapr
objects as if they were the genotype probabilities themselves. For example, use names()
to get the chromosome names.
names(fpr)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "X"
If you selecting a chromosome, it will be read from the fst database and into an array.
<- fapr[["X"]]
apr_X dim(apr_X)
## [1] 284 2 2
You can also use the $
operator.
<- fapr$X
apr_X dim(apr_X)
## [1] 284 2 2
You can subset by individuals, chromosome, and markers, with subset(object,ind,chr,mar)
or [ind,chr,mar]
. Just the selected portion will be read, and the fst database will not be altered.
<- subset(fapr, ind=1:20, chr=c("2","3"))
selected_ind dim(fapr)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## ind 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284
## gen 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## mar 3 5 2 2 2 2 7 8 5 2 7 2 2 2 2 5 2 2 2
## X
## ind 284
## gen 2
## mar 2
You can also subset with brackets in various ways.
<- fapr[1:20, c("2","3")][["3"]]
fapr_sub1 <- fapr[,"2"]
fapr_sub2 <- fapr[,c("2","3")]
fapr_sub23 <- fapr[,"X"] fapr_subX
You can use a third dimension for markers, but be careful that if you select a subset of markers that excludes one or more chromosomes, those will be dropped.
dim(subset(fapr, mar=1:30))
## 1 2 3 4 5 6 7 8
## ind 284 284 284 284 284 284 284 284
## gen 2 2 2 2 2 2 2 2
## mar 3 5 2 2 2 2 7 7
dim(fapr[ , , dimnames(fapr)$mar$X[1:2]])
## X
## ind 284
## gen 2
## mar 2
Binding by columns (chromosomes) or rows (individuals) may cause creation of a new fst database if input objects arose from different fst databases. However, if objects are subsets of the same "fst_genoprob"
object, then it reuses the one fst database. Further, if objects have the same directory and file basename for their fst databases, they will be combined without creation of any new fst databases.
See example(cbind.fst_genoprob)
and example(rbind.fst_genoprob)
with objects having distinct fst databases.
Here’s column bind (chromosomes).
<- cbind(fapr_sub2,fapr_sub23) fapr_sub223
And here’s row bind (individuals)..
<- fapr[1:20, c("2","3")]
f23a <- fapr[40:79, c("2","3")]
f23b <- rbind(f23a, f23b) f23
Subset on markers. This way only extracts the selected markers
from the fst database before creating the array.
<- dimnames(fapr$X)[[3]][1:2]
markers dim(fapr[,,markers]$X)
## [1] 284 2 2
This way extracts all markers on X
, creates the array, then subsets on selected markers
.
<- dimnames(fapr$X)[[3]]
markers dim(fapr$X[,,markers[1:2]])
## [1] 284 2 2
Two "fst_genoprob"
objects using the same database. Combine using cbind
. Notice that the order of chromosomes is reversed by joining fapr2
to fapr3
. Be sure to not overwrite existing fst databases!
<- fst_genoprob(subset(apr, chr="2"), "aprx", tmpdir, quiet=TRUE)
fapr2 <- fst_genoprob(subset(apr, chr="3"), "aprx", tmpdir, quiet=TRUE)
fapr3 <- cbind(fapr3,fapr2)
fapr32 dim(fapr32)
## 3 2
## ind 284 284
## gen 2 2
## mar 2 5
list.files(tmpdir)
## [1] "apr_1.fst" "apr_10.fst" "apr_11.fst"
## [4] "apr_12.fst" "apr_13.fst" "apr_14.fst"
## [7] "apr_15.fst" "apr_16.fst" "apr_17.fst"
## [10] "apr_18.fst" "apr_19.fst" "apr_2.fst"
## [13] "apr_3.fst" "apr_4.fst" "apr_5.fst"
## [16] "apr_6.fst" "apr_7.fst" "apr_8.fst"
## [19] "apr_9.fst" "apr_X.fst" "apr_fstindex.rds"
## [22] "aprx_2.fst" "aprx_3.fst" "aprx_fstindex.rds"
## [25] "pr_1.fst" "pr_10.fst" "pr_11.fst"
## [28] "pr_12.fst" "pr_13.fst" "pr_14.fst"
## [31] "pr_15.fst" "pr_16.fst" "pr_17.fst"
## [34] "pr_18.fst" "pr_19.fst" "pr_2.fst"
## [37] "pr_3.fst" "pr_4.fst" "pr_5.fst"
## [40] "pr_6.fst" "pr_7.fst" "pr_8.fst"
## [43] "pr_9.fst" "pr_X.fst" "pr_fstindex.rds"
Let’s look under the hood at an "fst_genoprob"
object. Here are the names of elements it contains:
names(unclass(fapr))
## [1] "dim" "dimnames" "is_x_chr" "chr" "ind" "mar" "fst"
unclass(fapr)$fst
## [1] "/tmp/Rtmpr7kpsI/iron_genoprob/apr"
sapply(unclass(fapr)[c("ind","chr","mar")], length)
## ind chr mar
## 284 20 66
An "fst_genoprob"
object has all the original information. Thus, it is possible to restore the original object from a subset
(but not necessarily from a cbind
or rbind
). Here is an example.
<- subset(fapr, chr=c("2","3"))
fapr23 dim(fapr23)
## 2 3
## ind 284 284
## gen 2 2
## mar 5 2
dim(fst_restore(fapr23))
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## ind 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284 284
## gen 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## mar 3 5 2 2 2 2 7 8 5 2 7 2 2 2 2 5 2 2 2
## X
## ind 284
## gen 2
## mar 2
Use fst_path()
to determine the path to the fst database.
fst_path(fpr)
## [1] "/tmp/Rtmpr7kpsI/iron_genoprob/pr"
If you move the fst database, or if it’s using a relative path and you want to work with it from a different directory, use replace_path()
.
<- replace_path(fpr, tempdir()) fpr_newpath
Since the genotype probabilities can be really large, it’s very RAM intensive to calculate all of them and then create the database. Instead, you can use calc_genoprob_fst()
to run calc_genoprob()
and then fst_genoprob()
for one chromosome at a time.
<- calc_genoprob_fst(iron, "pr", tmpdir, error_prob=0.002, overwrite=TRUE) fpr
Similarly, genoprob_to_alleleprob_fst()
will run genoprob_to_alleleprob()
and then fst_genoprob()
for one chromosome at a time.
<- genoprob_to_alleleprob_fst(pr, "apr", tmpdir, overwrite=TRUE) fapr
You can use the fst_genoprob()
object in place of the genotype probabilities, in genome scans with scan1()
.
<- get_x_covar(iron)
Xcovar <- scan1(fpr, iron$pheno, Xcovar=Xcovar)
scan_pr find_peaks(scan_pr, iron$pmap, threshold=4)
## lodindex lodcolumn chr pos lod
## 1 1 liver 2 122.81416 4.957564
## 2 1 liver 16 53.96263 7.282938
## 3 2 spleen 8 32.01662 4.302919
## 4 2 spleen 9 101.52887 10.379771
Similarly for calculating QTL coefficients with scan1coef()
or scan1blup()`:
<- scan1coef(fpr[,"16"], iron$pheno[,1])
coef16 <- scan1blup(fpr[,"16"], iron$pheno[,1]) blup16