library(epitopR)
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.1.2
#> Warning: package 'ggplot2' was built under R version 4.1.2
#> Warning: package 'tibble' was built under R version 4.1.2
#> Warning: package 'tidyr' was built under R version 4.1.2
#> Warning: package 'readr' was built under R version 4.1.2
#> Warning: package 'dplyr' was built under R version 4.1.2
#> Warning: package 'stringr' was built under R version 4.1.2
library(devtools)
#> Warning: package 'devtools' was built under R version 4.1.2
#> Warning: package 'usethis' was built under R version 4.1.2
library(ggseqlogo)
library(here)
library(fs)
library(Biostrings)
#> Warning: package 'S4Vectors' was built under R version 4.1.1
#> Warning: package 'GenomeInfoDb' was built under R version 4.1.1
The mhcII_hu function requires several inputs.
Allele inputs: The presenting antigen, which is the self/recipient MHC II allele, is input as “ag_present.” The stimulating antigen, which is the foreign/donor MHC molecule, is input as “ag_stim.” Finally, self antigens from the homologous loci to the stimulating antigens are input as “ag_self.” Each of these inputs can either be an individual allele, or a set of alleles. The sequences of each allele are pulled from the IPD-IMGT/HLA Database and can be viewed here ftp site. Alternatively, if the advanced user wishes to generate their own reference sequences, this can be manipulated by running the ref.R script.
The “seq_len” parameter refers to the length of peptide to generate. For class II peptide presentation, we recommend a sequence length of 15. If the user wishes to consider peptides of varying lengths, they can input a string of multiple lengths separated by commas.
The “cutoff” parameter sets the threshold for strong and weak binders. Multiple prediction methods are used, each of which provide different outputs (i.e. IC50, “strength”, “score”). Our justification for the default thresholds is below, however the user may choose to specify alternate cutoffs is desired.
# input antigen name
<- mhcII_hu(ag_present = c("DRB1_08_01"),
out_named_ag ag_stim = c("DQA1_01_01","DQA1_04_01"),
ag_self = c("DQA1_02_01"),
seq_len = 15,
method = "net")
Alternatively protein sequences for stimulating and self antigens can be input manually as stored variables. *Note this is not an option for the presenting allele “ag_present” which must be entered as a character string
# input protein sequence
<- "MILNKALLLGALALTTVMSPCGGEDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEEFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELTETVVCALGLSVGLVGIVVGTVFIIQGLRSVGASRHQGPL"
dqa0101
<- "MILNKALMLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSGQFTHEFDGDEEFYVDLERKETVWKLPLFHRLRFDPQFALTNIAVLKHNLNILIKRSNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGHSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDEPLLKHWEPEIPAPMSELTETVVCALGLSVGLVGIVVGTVLIIRGLRSVGASRHQGPL"
dqa0202
<- "MILNKALLLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSGQYTHEFDGDEQFYVDLGRKETVWCLPVLRQFRFDPQFALTNIAVTKHNLNILIKRSNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGHSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDEPLLKHWEPEIPAPMSELTETVVCALGLSVGLVGIVVGTVFIIRGLRSVGASRHQGPL"
dqa0401
<- mhcII_hu(ag_present = c("DRB1_08_01"),
out_stored_ag ag_stim = c(dqa0101, dqa0401),
ag_self = c(dqa0202),
seq_len = 15,
method = "net")
# run prediction of dqa0202 and dqa0401
<- mhcII_hu(ag_present = c("DRB1_08_01"),
out_single_ag ag_stim = c(dqa0101),
ag_self = c(dqa0202),
seq_len = 15,
method = "net")
<- "MILNKALLLGALALTTVMSPCGGEDIVADHVASCGVNLYQFYGPSGQYTHEFDGDEEFYVDLERKETAWRWPEFSKFGGFDPQGALRNMAVAKHNLNIMIKRYNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGQSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDQPLLKHWEPEIPAPMSELTETVVCALGLSVGLVGIVVGTVFIIQGLRSVGASRHQGPL"
dqa0101_algn
<- "MILNKALMLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSGQFTHEFDGDEEFYVDLERKETVWKLPLFHRLR-FDPQFALTNIAVLKHNLNILIKRSNSTAATNEVPEVTVFSKSPVTLGQPNTLICLVDNIFPPVVNITWLSNGHSVTEGVSETSFLSKSDHSFFKISYLTFLPSADEIYDCKVEHWGLDEPLLKHWEPEIPAPMSELTETVVCALGLSVGLVGIVVGTVLIIRGLRSVGASRHQGPL"
dqa0202_algn
<- core_mut(out_single_ag, ag_stim=dqa0101_algn, ag_self=dqa0202_algn)
core_mut_result
<- core_mut_result %>%
core_mut_result filter(core_mut=="yes") %>%
select(-c(core_mut))
%>%
out_stored_ag mutate(antigen=as.factor(antigen)) %>%
ggplot(aes(antigen, rank_val, color=as.factor(strength_rank))) +
geom_jitter(width=0.2) +
xlab("Stimulating Antigen") +
ylab("Adjusted Rank (Percentile)") +
labs(color = "Predicted Binding") +
theme_light()
# Plot of rank x IC50
%>%
out_stored_ag ggplot(aes(score_val, rank_val, color=as.factor(antigen), group=as.factor(antigen))) +
geom_point() +
facet_wrap(~as.factor(antigen)) +
labs(color="Antigen")
<- mhcII_hu(ag_present = c("DRB1_08_01"),
out_stored_ag ag_stim = c(dqa0101, dqa0401),
ag_self = c(dqa0202),
seq_len = 15,
method = "net",
cutoff_score = list(cutoff_netpan = c(50, 200),
cutoff_comblib = c(50, 200),
cutoff_nn_align = c(50, 200),
cutoff_sturniolo = c(2)),
cutoff_rank= c(2, 10))
# Plot of rank x IC50
%>%
out_stored_ag ggplot(aes(score_val, rank_val, color=as.factor(antigen), group=as.factor(antigen))) +
geom_point() +
facet_wrap(~as.factor(antigen)) +
labs(color="Antigen")
<- out_named_ag %>%
dqa_01_peps filter(antigen=="DQA1_01_01") %>%
filter(strength_rank%in% c("weak", "strong"))
ggseqlogo(dqa_01_peps$pep_stim)
Initial thresholds for IC50 thresholds were determined by Sette in 1998, with IC50<100 is strong, 100-1000 is weak (1). More recently, IEDB guidelines recommend using IC50 <50 and and 50-500 for strong and intermediate affinity groups. “As a rough guideline, peptides with IC50values <50 nM are considered high affinity, <500 nM intermediate affinity and <5000 nM low affinity. Most known epitopes have high or intermediate affinity. Some epitopes have low affinity, but no known T-cell epitope has an IC50 value greater than 5000.” (2) Thresholds for IC50 can be used for most methods other than netmhcpan_el and Sturniolo. Sturniolo scoring positively correlates with binding affinity, therefore a higher score is preferred. In Hammer 1994, the authors concluded that the majority of peptides with strong binding affinity had a score >= 2 (3). There is no additional threshold for weak binding affinity, so only one threshold is used for this scoring system. For netmhcpan_ba and netmhcpan_el, the provided output “percent rank” is a transformation that normalizes prediction scores by comparison to the predicted scores of random natural peptides (4). Since netmhcpan_el is based off of studies of experimentally eluted peptides rather than studies of binding affinity, it can not be thresholded by IC50 as other scores. Rather, the provided “score” is the likelihood of the peptide being an eluted ligand - for example, a score of 0.66 means a likelihood of 66% that the peptide would be a ligand. For this method of peptide scoring, we have chosen to use the adjusted percent rank to threshold, with cutoffs of 2% and 10% as recommended (4).
Consensus:
NN-align:
Jensen KK, Andreatta M, Marcatili P, Buus S, Greenbaum JA, Yan Z, Sette A, Peters B, Nielsen M. 2018. NN-align. Improved methods for predicting peptide binding affinity to MHC class II molecules. Immunology 154(3):394-406.
Nielsen M, Lund O. 2009. NN-align. An artificial neural network-based alignment algorithm for MHC class II peptide binding prediction. BMC Bioinformatics. 10:296.
SMM-align:
Combinatorial library:
Sturniolo:
NetMHCIIpan:
Reynisson B., Alvarez B., Paul S., Peters B., Nielsen M. 2020. NetMHCpan-4.1 and NetMHCIIpan-4.0: improved predictions of MHC antigen presentation by concurrent motif deconvolution and integration of MS MHC eluted ligand data. Nucleic Acids Res. 48(W1):W449-W454.
Jensen KK, Andreatta M, Marcatili P, Buus S, Greenbaum JA, Yan Z, Sette A, Peters B, Nielsen M. 2018. Improved methods for predicting peptide binding affinity to MHC class II molecules. Immunology 154(3):394-406.
Andreatta M, Karosiene E, Rasmussen M, Stryhn A, Buus S, and Nielsen M. 2015. Accurate pan-specific prediction of peptide-MHC class II binding affinity with improved binding core identification. Immunogenetics.67(11-12):641-50. PMID: 26416257 Download PDF