From 9a9e6c2ccb4bb87eefa2df477e901ebc86fe13c4 Mon Sep 17 00:00:00 2001 From: stevemussmann Date: Thu, 9 Apr 2026 11:35:12 -0700 Subject: [PATCH 1/2] fix outlier detection bug --- R/combine_bgc_output.R | 336 +++++++++++++++--------------- R/get_bgc_outliers.R | 451 +++++++++++++++++++++-------------------- R/plot_traces.R | 12 ++ scripts/run_bgc.sh | 8 + 4 files changed, 422 insertions(+), 385 deletions(-) mode change 100644 => 100755 scripts/run_bgc.sh diff --git a/R/combine_bgc_output.R b/R/combine_bgc_output.R index 118628b..9e03505 100644 --- a/R/combine_bgc_output.R +++ b/R/combine_bgc_output.R @@ -1,161 +1,175 @@ -#' Aggregate BGC Output -#' -#' This function aggregates multiple BGC runs into one file. Runs are joined -#' by columns and must all contain the same number of post-burnin MCMC -#' iterations. It assumes that input files are contained in a single directory -#' and have specific file suffixes. See the README.md file for details on -#' file suffixes. Files can have population prefixes. -#' -#' @param results.dir Path to a directory containing all the BGC results -#' @param prefix Prefix to the input files; e.g., "population1" -#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half -#' @param discard Discard first N samples from each BGC run -#' @return A list of data.frames (each data.frame is a BGC parameter) -#' @export -#' @examples -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "population1") -#' -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "population2", -#' thin = 2) -#' -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "pop3", -#' discard = 2000) -combine_bgc_output <- function(results.dir, - prefix, - thin = NULL, - discard = NULL){ - - writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n")) - - lnl <- list.files(path = results.dir, - pattern = paste0(prefix, ".*lnl_\\d+$"), - full.names = TRUE) - check_if_files(lnl) - - a0 <- list.files(path = results.dir, - pattern = paste0(prefix, ".*a0_\\d+$"), - full.names = TRUE) - check_if_files(a0) - - b0 <- list.files(path = results.dir, - pattern = paste0(prefix, ".*b0_\\d+$"), - full.names = TRUE) - check_if_files(b0) - - qa <- list.files(path = results.dir, - pattern = paste0(prefix, ".*qa_\\d+$"), - full.names = TRUE) - check_if_files(qa) - - qb <- list.files(path = results.dir, - pattern = paste0(prefix, ".*qb_\\d+$"), - full.names = TRUE) - check_if_files(qb) - - hi <- list.files(path = results.dir, - pattern = paste0(prefix, ".*hi_\\d+$"), - full.names = TRUE) - check_if_files(hi) - - gc() - - if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi))){ - stop("Error: All parameters must have the same number of BGC runs!") - } - - if (!is.null(thin)){ - writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin)) - } - if (!is.null(discard)){ - writeLines(paste0("\n\nDiscarding samples as burnin: ", discard)) - } - - # Aggregate the BGC runs into one data.frame - lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard) - gc() - a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard) - gc() - b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard) - gc() - qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard) - gc() - qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard) - gc() - hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard) - gc() - - writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df))) - - -writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n")) - -df.list <- - list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df) - -return(df.list) -} - -#' Validates that input files were found -#' @param files A vector of file paths -#' @noRd -check_if_files <- function(files){ - # Make sure list.files found the correct input files. - if (length(files) == 0){ - stop(paste0("Error: The files ", - files, - " were not found in the input directory.")) - } -} - -#' Combine bgc runs together by binding columns (uses dplyr and data.table) -#' @param files A vector of file paths -#' @param thin Boolean. If TRUE, thins MCMC to every nth sample -#' @noRd -bind_bgc_runs <- function(files, thin = NULL, discard = NULL){ - - data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE) - - if (!is.null(thin)){ - data_list <- lapply(data_list, function(x) thin_MCMC(x, thin)) - } - if (!is.null(discard)){ - data_list <- lapply(data_list, function(x) discardNsamples(x, discard)) - } - - df <- dplyr::bind_cols(data_list) - - rm(data_list) - gc() - return(df) -} - -#' Validate that all parameters have same number of BGC runs -#' @param l List of filepath vectors -#' @noRd -countBGCruns <- function(l){ - return(length(unique(sapply(l, length))) == 1) -} - -#' Thin MCMC iterations every Nth iteration. -#' @param df data.frame with each MCMC sample as a column -#' @param n Only keep every nth iteration -#' @noRd -thin_MCMC <- function(df, n){ - df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)] - rm(df) - gc() - return(df.new) -} - -#' Discard first N samples of each run -#' @param df data.frame containing -#' @noRd -discardNsamples <- function(df, n){ - df.new <- df[,(n+1):ncol(df)] - rm(df) - gc() - return(df.new) -} +#' Aggregate BGC Output +#' +#' This function aggregates multiple BGC runs into one file. Runs are joined +#' by columns and must all contain the same number of post-burnin MCMC +#' iterations. It assumes that input files are contained in a single directory +#' and have specific file suffixes. See the README.md file for details on +#' file suffixes. Files can have population prefixes. +#' +#' @param results.dir Path to a directory containing all the BGC results +#' @param prefix Prefix to the input files; e.g., "population1" +#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half +#' @param discard Discard first N samples from each BGC run +#' @return A list of data.frames (each data.frame is a BGC parameter) +#' @export +#' @examples +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "population1") +#' +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "population2", +#' thin = 2) +#' +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "pop3", +#' discard = 2000) +combine_bgc_output <- function(results.dir, + prefix, + thin = NULL, + discard = NULL){ + + writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n")) + + lnl <- list.files(path = results.dir, + pattern = paste0(prefix, ".*lnl_\\d+$"), + full.names = TRUE) + check_if_files(lnl) + + a0 <- list.files(path = results.dir, + pattern = paste0(prefix, ".*a0_\\d+$"), + full.names = TRUE) + check_if_files(a0) + + b0 <- list.files(path = results.dir, + pattern = paste0(prefix, ".*b0_\\d+$"), + full.names = TRUE) + check_if_files(b0) + + qa <- list.files(path = results.dir, + pattern = paste0(prefix, ".*qa_\\d+$"), + full.names = TRUE) + check_if_files(qa) + + qb <- list.files(path = results.dir, + pattern = paste0(prefix, ".*qb_\\d+$"), + full.names = TRUE) + check_if_files(qb) + + hi <- list.files(path = results.dir, + pattern = paste0(prefix, ".*hi_\\d+$"), + full.names = TRUE) + check_if_files(hi) + + ta <- list.files(path = results.dir, + pattern = paste0(prefix, ".*ta_\\d+$"), + full.names = TRUE) + check_if_files(ta) + + tb <- list.files(path = results.dir, + pattern = paste0(prefix, ".*tb_\\d+$"), + full.names = TRUE) + check_if_files(tb) + + gc() + + if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){ + stop("Error: All parameters must have the same number of BGC runs!") + } + + if (!is.null(thin)){ + writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin)) + } + if (!is.null(discard)){ + writeLines(paste0("\n\nDiscarding samples as burnin: ", discard)) + } + + # Aggregate the BGC runs into one data.frame + lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard) + gc() + a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard) + gc() + b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard) + gc() + qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard) + gc() + qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard) + gc() + hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard) + gc() + ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard) + gc() + tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard) + gc() + + writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df))) + + +writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n")) + +df.list <- + list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df) + +return(df.list) +} + +#' Validates that input files were found +#' @param files A vector of file paths +#' @noRd +check_if_files <- function(files){ + # Make sure list.files found the correct input files. + if (length(files) == 0){ + stop(paste0("Error: The files ", + files, + " were not found in the input directory.")) + } +} + +#' Combine bgc runs together by binding columns (uses dplyr and data.table) +#' @param files A vector of file paths +#' @param thin Boolean. If TRUE, thins MCMC to every nth sample +#' @noRd +bind_bgc_runs <- function(files, thin = NULL, discard = NULL){ + + data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE) + + if (!is.null(thin)){ + data_list <- lapply(data_list, function(x) thin_MCMC(x, thin)) + } + if (!is.null(discard)){ + data_list <- lapply(data_list, function(x) discardNsamples(x, discard)) + } + + df <- dplyr::bind_cols(data_list) + + rm(data_list) + gc() + return(df) +} + +#' Validate that all parameters have same number of BGC runs +#' @param l List of filepath vectors +#' @noRd +countBGCruns <- function(l){ + return(length(unique(sapply(l, length))) == 1) +} + +#' Thin MCMC iterations every Nth iteration. +#' @param df data.frame with each MCMC sample as a column +#' @param n Only keep every nth iteration +#' @noRd +thin_MCMC <- function(df, n){ + df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)] + rm(df) + gc() + return(df.new) +} + +#' Discard first N samples of each run +#' @param df data.frame containing +#' @noRd +discardNsamples <- function(df, n){ + df.new <- df[,(n+1):ncol(df)] + rm(df) + gc() + return(df.new) +} diff --git a/R/get_bgc_outliers.R b/R/get_bgc_outliers.R index 2c29957..c1e6801 100644 --- a/R/get_bgc_outliers.R +++ b/R/get_bgc_outliers.R @@ -1,224 +1,227 @@ -#' Identify Outlier BGC SNPs Using Two Criteria -#' -#' These functions identify alpha and beta BGC outliers using two criteria. -#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap -#' with 0, it is considered to have excess ancestry (positive or negative). -#' Second, if a SNP's alpha or beta falls outside the quantile interval -#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative). -#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE. -#' These criteria are discussed in the Gompert BGC papers. -#' -#' Get credible intervals of a BGC parameter -#' @param df data.frame of one BGC parameter (e.g. alpha) -#' @return data.frame with credible intervals for the parameter -#' @noRd -get_cis <- function(df){ - # Get credible interval for parameters. - cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95)) - cis <- do.call("rbind", cis) - param.mean <- rowMeans(as.matrix(df)) - param.median <- apply(df, 1, function(x) median(x)) - - df2 <- data.frame(mean=param.mean, - median=param.median, - lb=cis$CI_low, - ub=cis$CI_high, - stringsAsFactors = FALSE) - - # Clear memory. - rm(param.mean, param.median, cis) - gc() - return(df2) -} - -#' Get alpha and beta outliers -#' @param a.out data.frame of alpha BGC results -#' @param b.out data.frame of beta BGC results -#' @param qa.out data.frame of qa (gamma-quantile) BGC results -#' @param qb.out data.frame of qb (zeta-quantile) BGC results -#' @param loci Path to two-column file containing loci names (in BGC order) -#' @param qn Upper quantile interval value -#' @return data.frame containing SNP outlier info -#' @noRd -get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn){ - # Get excess ancestry and outliers for alpha and beta parameters. - names(a.out)[3:4] <- c('lb','ub') - names(b.out)[3:4] <- c('lb','ub') - names(qa.out)[3:4] <- c('lb','ub') - names(qb.out)[3:4] <- c('lb','ub') - - # Excess ancestry compared to genome-wide average introgression - # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively. - # Do alpha. - a.out$excess <- NA - a.out$excess[a.out$lb > 0] <- 'pos' - a.out$excess[a.out$ub < 0] <- 'neg' - - # Beta. - b.out$excess <- NA - b.out$excess[b.out$lb > 0] <- 'pos' - b.out$excess[b.out$ub < 0] <- 'neg' - - # Add in quantiles - a.out$q <- qa.out$mean - b.out$q <- qb.out$mean - - # qnorm takes SD, so take the square root of the quantile estimate. - # This basically generates the interval to check if alpha and beta are - # outliers. - # The interval is defined by qn = n/2 and qn = 1-n/2. - # If the median for alpha or beta fall outside that interval, it's an - # outlier. - # By default, qn is set to 0.025 and 0.975 - qn_lower <- 1 - qn - - a.out$qlb <- qnorm(qn_lower,0,sqrt(a.out$q)) - a.out$qub <- qnorm(qn,0,sqrt(a.out$q)) - - b.out$qlb <- qnorm(qn_lower,0,sqrt(b.out$q)) - b.out$qub <- qnorm(qn,0,sqrt(b.out$q)) - - # Check if median falls outside qn interval. - a.out$outlier <- NA - a.out$outlier[a.out$median > a.out$qub] <- 'pos' - a.out$outlier[a.out$median < a.out$qlb] <- 'neg' - - b.out$outlier <- NA - b.out$outlier[b.out$median > b.out$qub] <- 'pos' - b.out$outlier[b.out$median < b.out$qlb] <- 'neg' - - # Read loci file generated using my vcf2bgc.py script. - # Contains actual scaffold names. - snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE) - - if (nrow(snps) != nrow(a.out)){ - writeLines(paste0("\n\nError: There are ", nrow(snps), " loci ")) - writeLines("in the loci.file") - stop("loci.file length must equal rows in the BGC output") - } - - ### Copy the data to snps data.frame. - # Mean a and b - snps$alpha <- a.out$mean - snps$beta <- b.out$mean - - # Excess ancestry and outliers. - snps$alpha.excess <- a.out$excess - snps$beta.excess <- b.out$excess - snps$alpha.outlier <- a.out$outlier - snps$beta.outlier <- b.out$outlier - - # If has both excess ancestry AND is an outlier. - snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier) - snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier) - - # Clear memory. - rm(a.out, b.out, qa.out, qb.out, loci) - gc() - - return(snps) -} - -#' Gets hybrid index mean across MCMC iterations for each individual -#' @param df data.frame of BGC hybrid index results -#' @return Matrix containing MCMC hybrid index means -#' @noRd -get_hi <- function(df){ - hi.mean <- rowMeans(as.matrix(df)) - return(hi.mean) -} - -#' Parses BGC output and identifies outlier SNPs -#' @param df.list List of data.frame objects from combine_bgc_output() -#' @param admix.pop String identifying admixed population in population map -#' file -#' @param popmap Population map file consisting of two tab-separated columns: -#' indID popID (no header line) -#' @param qn Upper quantile interval value for determining outlier SNPs. -#' Default = 0.975 -#' @param loci.file Path to two-column tab-separated file containing loci names -#' (in same order as BGC input). Column 1 should be locus -#' name, column2 should be the position of the SNP. -#' header line should be: #CHROM POS -#' @param save.obj File path. If supplied, saves outlier info as .RDS object. -#' The value for save.obj should be the filename for the -#' RDS object -#' @return List containing 3 data.frame objects. 1) outlier info, -#' 2)popmap info, 3) hybrid index info -#' @export -#' @examples -#' outliers <- get_bgc_outliers(df.list = aggregated.results, -#' admix.pop = "population1", -#' popmap = "./popmap.txt", -#' qn = 0.975, -#' loci.file = "./bgc_loci.txt", -#' save.obj = "./outlierInfo.rds") -get_bgc_outliers <- function(df.list, - admix.pop, - popmap, - qn = 0.975, - loci.file = NULL, - save.obj = NULL){ - - # Get 95% credible intervals for alpha, beta, gamma-quantile (qa), - # zeta-quantile (qb). - a <- get_cis(df.list[[2]]) - gc() - b <- get_cis(df.list[[3]]) - gc() - qa <- get_cis(df.list[[4]]) - gc() - qb <- get_cis(df.list[[5]]) - gc() - - clean=FALSE - #if no loci file provided, make a spoof one here - if (is.null(loci.file)){ - clean=TRUE - spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE) - readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE) - loci.file="loci_map.txt" - } - - if (!file.exists(loci.file)){ - stop(paste0("Error: The loci file ", loci.file, " does not exist!")) - } - - if (!file.exists(popmap)){ - stop(paste0("Error: The popmap file ", popmap, " does not exist!")) - } - - # Find SNPs with excess ancestry and outliers SNPs. - snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn) - gc() - - # Get hybrid indexes. - hi <- get_hi(df.list[[6]]) - - # Read in popmap. - # Tab-separated, two-columns: indID\tpopID - popmap <- read.table(file = popmap, stringsAsFactors = FALSE) - - # Get data.frame with indIDs of admixed individuals and hybrid index. - admix <- popmap[ which(popmap$V2 == admix.pop),] - hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE) - - res <- list(snps, popmap, hi.df) - - rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df) - gc() - # If user specified filepath: save res as rds object. - if(!is.null(save.obj)){ - saveRDS(res, save.obj) - } - - gc() - - #clean up spoofed loci_map - if (clean == TRUE){ - file.remove(loci.file) - } - - return(res) - -} +#' Identify Outlier BGC SNPs Using Two Criteria +#' +#' These functions identify alpha and beta BGC outliers using two criteria. +#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap +#' with 0, it is considered to have excess ancestry (positive or negative). +#' Second, if a SNP's alpha or beta falls outside the quantile interval +#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative). +#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE. +#' These criteria are discussed in the Gompert BGC papers. +#' +#' Get credible intervals of a BGC parameter +#' @param df data.frame of one BGC parameter (e.g. alpha) +#' @return data.frame with credible intervals for the parameter +#' @noRd +get_cis <- function(df){ + # Get credible interval for parameters. + cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95)) + cis <- do.call("rbind", cis) + param.mean <- rowMeans(as.matrix(df)) + param.median <- apply(df, 1, function(x) median(x)) + + df2 <- data.frame(mean=param.mean, + median=param.median, + lb=cis$CI_low, + ub=cis$CI_high, + stringsAsFactors = FALSE) + + # Clear memory. + rm(param.mean, param.median, cis) + gc() + return(df2) +} + +#' Get alpha and beta outliers +#' @param a.out data.frame of alpha BGC results +#' @param b.out data.frame of beta BGC results +#' @param qa.out data.frame of qa (gamma-quantile) BGC results +#' @param qb.out data.frame of qb (zeta-quantile) BGC results +#' @param loci Path to two-column file containing loci names (in BGC order) +#' @param qn Upper quantile interval value +#' @return data.frame containing SNP outlier info +#' @noRd +get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){ + # Get excess ancestry and outliers for alpha and beta parameters. + names(a.out)[3:4] <- c('lb','ub') + names(b.out)[3:4] <- c('lb','ub') + names(qa.out)[3:4] <- c('lb','ub') + names(qb.out)[3:4] <- c('lb','ub') + + # Excess ancestry compared to genome-wide average introgression + # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively. + # Do alpha. + a.out$excess <- NA + a.out$excess[a.out$lb > 0] <- 'pos' + a.out$excess[a.out$ub < 0] <- 'neg' + + # Beta. + b.out$excess <- NA + b.out$excess[b.out$lb > 0] <- 'pos' + b.out$excess[b.out$ub < 0] <- 'neg' + + # Add in quantiles + a.out$q <- qa.out$mean + b.out$q <- qb.out$mean + + # qnorm takes SD, so take square root of the reciprocal of tau. + # This basically generates the interval to check if alpha and beta are + # outliers. + # The interval is defined by qn = n/2 and qn = 1-n/2. + # If the median for alpha or beta fall outside that interval, it's an + # outlier. + # By default, qn is set to 0.025 and 0.975 + qn_lower <- 1 - qn + + a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta)) + a.out$qub <- qnorm(qn,0,sqrt(1/ta)) + + b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb)) + b.out$qub <- qnorm(qn,0,sqrt(1/tb)) + + + # Check if median falls outside qn interval. + a.out$outlier <- NA + a.out$outlier[a.out$median > a.out$qub] <- 'pos' + a.out$outlier[a.out$median < a.out$qlb] <- 'neg' + + b.out$outlier <- NA + b.out$outlier[b.out$median > b.out$qub] <- 'pos' + b.out$outlier[b.out$median < b.out$qlb] <- 'neg' + + # Read loci file generated using my vcf2bgc.py script. + # Contains actual scaffold names. + snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE) + + if (nrow(snps) != nrow(a.out)){ + writeLines(paste0("\n\nError: There are ", nrow(snps), " loci ")) + writeLines("in the loci.file") + stop("loci.file length must equal rows in the BGC output") + } + + ### Copy the data to snps data.frame. + # Mean a and b + snps$alpha <- a.out$mean + snps$beta <- b.out$mean + + # Excess ancestry and outliers. + snps$alpha.excess <- a.out$excess + snps$beta.excess <- b.out$excess + snps$alpha.outlier <- a.out$outlier + snps$beta.outlier <- b.out$outlier + + # If has both excess ancestry AND is an outlier. + snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier) + snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier) + + # Clear memory. + rm(a.out, b.out, qa.out, qb.out, loci) + gc() + + return(snps) +} + +#' Gets hybrid index mean across MCMC iterations for each individual +#' @param df data.frame of BGC hybrid index results +#' @return Matrix containing MCMC hybrid index means +#' @noRd +get_hi <- function(df){ + hi.mean <- rowMeans(as.matrix(df)) + return(hi.mean) +} + +#' Parses BGC output and identifies outlier SNPs +#' @param df.list List of data.frame objects from combine_bgc_output() +#' @param admix.pop String identifying admixed population in population map +#' file +#' @param popmap Population map file consisting of two tab-separated columns: +#' indID popID (no header line) +#' @param qn Upper quantile interval value for determining outlier SNPs. +#' Default = 0.975 +#' @param loci.file Path to two-column tab-separated file containing loci names +#' (in same order as BGC input). Column 1 should be locus +#' name, column2 should be the position of the SNP. +#' header line should be: #CHROM POS +#' @param save.obj File path. If supplied, saves outlier info as .RDS object. +#' The value for save.obj should be the filename for the +#' RDS object +#' @return List containing 3 data.frame objects. 1) outlier info, +#' 2)popmap info, 3) hybrid index info +#' @export +#' @examples +#' outliers <- get_bgc_outliers(df.list = aggregated.results, +#' admix.pop = "population1", +#' popmap = "./popmap.txt", +#' qn = 0.975, +#' loci.file = "./bgc_loci.txt", +#' save.obj = "./outlierInfo.rds") +get_bgc_outliers <- function(df.list, + admix.pop, + popmap, + qn = 0.975, + loci.file = NULL, + save.obj = NULL){ + + # Get 95% credible intervals for alpha, beta, gamma-quantile (qa), + # zeta-quantile (qb). + a <- get_cis(df.list[[2]]) + gc() + b <- get_cis(df.list[[3]]) + gc() + qa <- get_cis(df.list[[4]]) + gc() + qb <- get_cis(df.list[[5]]) + gc() + ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha + tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta + + clean=FALSE + #if no loci file provided, make a spoof one here + if (is.null(loci.file)){ + clean=TRUE + spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE) + readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE) + loci.file="loci_map.txt" + } + + if (!file.exists(loci.file)){ + stop(paste0("Error: The loci file ", loci.file, " does not exist!")) + } + + if (!file.exists(popmap)){ + stop(paste0("Error: The popmap file ", popmap, " does not exist!")) + } + + # Find SNPs with excess ancestry and outliers SNPs. + snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb) + gc() + + # Get hybrid indexes. + hi <- get_hi(df.list[[6]]) + + # Read in popmap. + # Tab-separated, two-columns: indID\tpopID + popmap <- read.table(file = popmap, stringsAsFactors = FALSE) + + # Get data.frame with indIDs of admixed individuals and hybrid index. + admix <- popmap[ which(popmap$V2 == admix.pop),] + hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE) + + res <- list(snps, popmap, hi.df) + + rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df) + gc() + # If user specified filepath: save res as rds object. + if(!is.null(save.obj)){ + saveRDS(res, save.obj) + } + + gc() + + #clean up spoofed loci_map + if (clean == TRUE){ + file.remove(loci.file) + } + + return(res) + +} diff --git a/R/plot_traces.R b/R/plot_traces.R index fd9de03..2075f04 100644 --- a/R/plot_traces.R +++ b/R/plot_traces.R @@ -50,6 +50,18 @@ plot_traces <- function(df.list, prefix, plotDIR = "./plots", showPLOTS=FALSE){ prefix = prefix, plotDIR = plotDIR, showPLOTS=showPLOTS) + plotBGCparams(df = df.list[[7]], + ylab = "Tau-alpha", + bgc_param = "ta", + prefix = prefix, + plotDIR = plotDIR, + showPLOTS=showPLOTS) + plotBGCparams(df = df.list[[8]], + ylab = "Tau-beta", + bgc_param = "tb", + prefix = prefix, + plotDIR = plotDIR, + showPLOTS=showPLOTS) writeLines(paste("Saved trace plots to", plotDIR)) diff --git a/scripts/run_bgc.sh b/scripts/run_bgc.sh old mode 100644 new mode 100755 index 794d8ae..e3524a1 --- a/scripts/run_bgc.sh +++ b/scripts/run_bgc.sh @@ -199,6 +199,14 @@ $ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \ $ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \ -p zeta-quantile -o ${RESULTS_DIR}/${prefix}_stat_qb_${run} -s 2 -w 0 +# Get tau-alpha (ta) +$ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \ + -p tau-alpha -o ${RESULTS_DIR}/${prefix}_stat_ta_${run} -s 2 -w 0 + +# Get tau-beta (tb) +$ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \ + -p tau-beta -o ${RESULTS_DIR}/${prefix}_stat_tb_${run} -s 2 -w 0 + echo "Done with estpost!" exit 0 From 1a9129617a33c7e0bb1fa0145bed55d8d768456c Mon Sep 17 00:00:00 2001 From: stevemussmann Date: Thu, 9 Apr 2026 11:37:38 -0700 Subject: [PATCH 2/2] unix line breaks --- R/combine_bgc_output.R | 350 +++++++++++++++---------------- R/get_bgc_outliers.R | 454 ++++++++++++++++++++--------------------- 2 files changed, 402 insertions(+), 402 deletions(-) diff --git a/R/combine_bgc_output.R b/R/combine_bgc_output.R index 9e03505..05ef810 100644 --- a/R/combine_bgc_output.R +++ b/R/combine_bgc_output.R @@ -1,175 +1,175 @@ -#' Aggregate BGC Output -#' -#' This function aggregates multiple BGC runs into one file. Runs are joined -#' by columns and must all contain the same number of post-burnin MCMC -#' iterations. It assumes that input files are contained in a single directory -#' and have specific file suffixes. See the README.md file for details on -#' file suffixes. Files can have population prefixes. -#' -#' @param results.dir Path to a directory containing all the BGC results -#' @param prefix Prefix to the input files; e.g., "population1" -#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half -#' @param discard Discard first N samples from each BGC run -#' @return A list of data.frames (each data.frame is a BGC parameter) -#' @export -#' @examples -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "population1") -#' -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "population2", -#' thin = 2) -#' -#' aggregate.results <- combine_bgc_output(results.dir = "./results", -#' prefix = "pop3", -#' discard = 2000) -combine_bgc_output <- function(results.dir, - prefix, - thin = NULL, - discard = NULL){ - - writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n")) - - lnl <- list.files(path = results.dir, - pattern = paste0(prefix, ".*lnl_\\d+$"), - full.names = TRUE) - check_if_files(lnl) - - a0 <- list.files(path = results.dir, - pattern = paste0(prefix, ".*a0_\\d+$"), - full.names = TRUE) - check_if_files(a0) - - b0 <- list.files(path = results.dir, - pattern = paste0(prefix, ".*b0_\\d+$"), - full.names = TRUE) - check_if_files(b0) - - qa <- list.files(path = results.dir, - pattern = paste0(prefix, ".*qa_\\d+$"), - full.names = TRUE) - check_if_files(qa) - - qb <- list.files(path = results.dir, - pattern = paste0(prefix, ".*qb_\\d+$"), - full.names = TRUE) - check_if_files(qb) - - hi <- list.files(path = results.dir, - pattern = paste0(prefix, ".*hi_\\d+$"), - full.names = TRUE) - check_if_files(hi) - - ta <- list.files(path = results.dir, - pattern = paste0(prefix, ".*ta_\\d+$"), - full.names = TRUE) - check_if_files(ta) - - tb <- list.files(path = results.dir, - pattern = paste0(prefix, ".*tb_\\d+$"), - full.names = TRUE) - check_if_files(tb) - - gc() - - if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){ - stop("Error: All parameters must have the same number of BGC runs!") - } - - if (!is.null(thin)){ - writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin)) - } - if (!is.null(discard)){ - writeLines(paste0("\n\nDiscarding samples as burnin: ", discard)) - } - - # Aggregate the BGC runs into one data.frame - lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard) - gc() - a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard) - gc() - b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard) - gc() - qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard) - gc() - qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard) - gc() - hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard) - gc() - ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard) - gc() - tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard) - gc() - - writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df))) - - -writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n")) - -df.list <- - list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df) - -return(df.list) -} - -#' Validates that input files were found -#' @param files A vector of file paths -#' @noRd -check_if_files <- function(files){ - # Make sure list.files found the correct input files. - if (length(files) == 0){ - stop(paste0("Error: The files ", - files, - " were not found in the input directory.")) - } -} - -#' Combine bgc runs together by binding columns (uses dplyr and data.table) -#' @param files A vector of file paths -#' @param thin Boolean. If TRUE, thins MCMC to every nth sample -#' @noRd -bind_bgc_runs <- function(files, thin = NULL, discard = NULL){ - - data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE) - - if (!is.null(thin)){ - data_list <- lapply(data_list, function(x) thin_MCMC(x, thin)) - } - if (!is.null(discard)){ - data_list <- lapply(data_list, function(x) discardNsamples(x, discard)) - } - - df <- dplyr::bind_cols(data_list) - - rm(data_list) - gc() - return(df) -} - -#' Validate that all parameters have same number of BGC runs -#' @param l List of filepath vectors -#' @noRd -countBGCruns <- function(l){ - return(length(unique(sapply(l, length))) == 1) -} - -#' Thin MCMC iterations every Nth iteration. -#' @param df data.frame with each MCMC sample as a column -#' @param n Only keep every nth iteration -#' @noRd -thin_MCMC <- function(df, n){ - df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)] - rm(df) - gc() - return(df.new) -} - -#' Discard first N samples of each run -#' @param df data.frame containing -#' @noRd -discardNsamples <- function(df, n){ - df.new <- df[,(n+1):ncol(df)] - rm(df) - gc() - return(df.new) -} +#' Aggregate BGC Output +#' +#' This function aggregates multiple BGC runs into one file. Runs are joined +#' by columns and must all contain the same number of post-burnin MCMC +#' iterations. It assumes that input files are contained in a single directory +#' and have specific file suffixes. See the README.md file for details on +#' file suffixes. Files can have population prefixes. +#' +#' @param results.dir Path to a directory containing all the BGC results +#' @param prefix Prefix to the input files; e.g., "population1" +#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half +#' @param discard Discard first N samples from each BGC run +#' @return A list of data.frames (each data.frame is a BGC parameter) +#' @export +#' @examples +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "population1") +#' +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "population2", +#' thin = 2) +#' +#' aggregate.results <- combine_bgc_output(results.dir = "./results", +#' prefix = "pop3", +#' discard = 2000) +combine_bgc_output <- function(results.dir, + prefix, + thin = NULL, + discard = NULL){ + + writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n")) + + lnl <- list.files(path = results.dir, + pattern = paste0(prefix, ".*lnl_\\d+$"), + full.names = TRUE) + check_if_files(lnl) + + a0 <- list.files(path = results.dir, + pattern = paste0(prefix, ".*a0_\\d+$"), + full.names = TRUE) + check_if_files(a0) + + b0 <- list.files(path = results.dir, + pattern = paste0(prefix, ".*b0_\\d+$"), + full.names = TRUE) + check_if_files(b0) + + qa <- list.files(path = results.dir, + pattern = paste0(prefix, ".*qa_\\d+$"), + full.names = TRUE) + check_if_files(qa) + + qb <- list.files(path = results.dir, + pattern = paste0(prefix, ".*qb_\\d+$"), + full.names = TRUE) + check_if_files(qb) + + hi <- list.files(path = results.dir, + pattern = paste0(prefix, ".*hi_\\d+$"), + full.names = TRUE) + check_if_files(hi) + + ta <- list.files(path = results.dir, + pattern = paste0(prefix, ".*ta_\\d+$"), + full.names = TRUE) + check_if_files(ta) + + tb <- list.files(path = results.dir, + pattern = paste0(prefix, ".*tb_\\d+$"), + full.names = TRUE) + check_if_files(tb) + + gc() + + if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){ + stop("Error: All parameters must have the same number of BGC runs!") + } + + if (!is.null(thin)){ + writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin)) + } + if (!is.null(discard)){ + writeLines(paste0("\n\nDiscarding samples as burnin: ", discard)) + } + + # Aggregate the BGC runs into one data.frame + lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard) + gc() + a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard) + gc() + b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard) + gc() + qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard) + gc() + qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard) + gc() + hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard) + gc() + ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard) + gc() + tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard) + gc() + + writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df))) + + +writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n")) + +df.list <- + list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df) + +return(df.list) +} + +#' Validates that input files were found +#' @param files A vector of file paths +#' @noRd +check_if_files <- function(files){ + # Make sure list.files found the correct input files. + if (length(files) == 0){ + stop(paste0("Error: The files ", + files, + " were not found in the input directory.")) + } +} + +#' Combine bgc runs together by binding columns (uses dplyr and data.table) +#' @param files A vector of file paths +#' @param thin Boolean. If TRUE, thins MCMC to every nth sample +#' @noRd +bind_bgc_runs <- function(files, thin = NULL, discard = NULL){ + + data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE) + + if (!is.null(thin)){ + data_list <- lapply(data_list, function(x) thin_MCMC(x, thin)) + } + if (!is.null(discard)){ + data_list <- lapply(data_list, function(x) discardNsamples(x, discard)) + } + + df <- dplyr::bind_cols(data_list) + + rm(data_list) + gc() + return(df) +} + +#' Validate that all parameters have same number of BGC runs +#' @param l List of filepath vectors +#' @noRd +countBGCruns <- function(l){ + return(length(unique(sapply(l, length))) == 1) +} + +#' Thin MCMC iterations every Nth iteration. +#' @param df data.frame with each MCMC sample as a column +#' @param n Only keep every nth iteration +#' @noRd +thin_MCMC <- function(df, n){ + df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)] + rm(df) + gc() + return(df.new) +} + +#' Discard first N samples of each run +#' @param df data.frame containing +#' @noRd +discardNsamples <- function(df, n){ + df.new <- df[,(n+1):ncol(df)] + rm(df) + gc() + return(df.new) +} diff --git a/R/get_bgc_outliers.R b/R/get_bgc_outliers.R index c1e6801..71a73d5 100644 --- a/R/get_bgc_outliers.R +++ b/R/get_bgc_outliers.R @@ -1,227 +1,227 @@ -#' Identify Outlier BGC SNPs Using Two Criteria -#' -#' These functions identify alpha and beta BGC outliers using two criteria. -#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap -#' with 0, it is considered to have excess ancestry (positive or negative). -#' Second, if a SNP's alpha or beta falls outside the quantile interval -#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative). -#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE. -#' These criteria are discussed in the Gompert BGC papers. -#' -#' Get credible intervals of a BGC parameter -#' @param df data.frame of one BGC parameter (e.g. alpha) -#' @return data.frame with credible intervals for the parameter -#' @noRd -get_cis <- function(df){ - # Get credible interval for parameters. - cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95)) - cis <- do.call("rbind", cis) - param.mean <- rowMeans(as.matrix(df)) - param.median <- apply(df, 1, function(x) median(x)) - - df2 <- data.frame(mean=param.mean, - median=param.median, - lb=cis$CI_low, - ub=cis$CI_high, - stringsAsFactors = FALSE) - - # Clear memory. - rm(param.mean, param.median, cis) - gc() - return(df2) -} - -#' Get alpha and beta outliers -#' @param a.out data.frame of alpha BGC results -#' @param b.out data.frame of beta BGC results -#' @param qa.out data.frame of qa (gamma-quantile) BGC results -#' @param qb.out data.frame of qb (zeta-quantile) BGC results -#' @param loci Path to two-column file containing loci names (in BGC order) -#' @param qn Upper quantile interval value -#' @return data.frame containing SNP outlier info -#' @noRd -get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){ - # Get excess ancestry and outliers for alpha and beta parameters. - names(a.out)[3:4] <- c('lb','ub') - names(b.out)[3:4] <- c('lb','ub') - names(qa.out)[3:4] <- c('lb','ub') - names(qb.out)[3:4] <- c('lb','ub') - - # Excess ancestry compared to genome-wide average introgression - # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively. - # Do alpha. - a.out$excess <- NA - a.out$excess[a.out$lb > 0] <- 'pos' - a.out$excess[a.out$ub < 0] <- 'neg' - - # Beta. - b.out$excess <- NA - b.out$excess[b.out$lb > 0] <- 'pos' - b.out$excess[b.out$ub < 0] <- 'neg' - - # Add in quantiles - a.out$q <- qa.out$mean - b.out$q <- qb.out$mean - - # qnorm takes SD, so take square root of the reciprocal of tau. - # This basically generates the interval to check if alpha and beta are - # outliers. - # The interval is defined by qn = n/2 and qn = 1-n/2. - # If the median for alpha or beta fall outside that interval, it's an - # outlier. - # By default, qn is set to 0.025 and 0.975 - qn_lower <- 1 - qn - - a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta)) - a.out$qub <- qnorm(qn,0,sqrt(1/ta)) - - b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb)) - b.out$qub <- qnorm(qn,0,sqrt(1/tb)) - - - # Check if median falls outside qn interval. - a.out$outlier <- NA - a.out$outlier[a.out$median > a.out$qub] <- 'pos' - a.out$outlier[a.out$median < a.out$qlb] <- 'neg' - - b.out$outlier <- NA - b.out$outlier[b.out$median > b.out$qub] <- 'pos' - b.out$outlier[b.out$median < b.out$qlb] <- 'neg' - - # Read loci file generated using my vcf2bgc.py script. - # Contains actual scaffold names. - snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE) - - if (nrow(snps) != nrow(a.out)){ - writeLines(paste0("\n\nError: There are ", nrow(snps), " loci ")) - writeLines("in the loci.file") - stop("loci.file length must equal rows in the BGC output") - } - - ### Copy the data to snps data.frame. - # Mean a and b - snps$alpha <- a.out$mean - snps$beta <- b.out$mean - - # Excess ancestry and outliers. - snps$alpha.excess <- a.out$excess - snps$beta.excess <- b.out$excess - snps$alpha.outlier <- a.out$outlier - snps$beta.outlier <- b.out$outlier - - # If has both excess ancestry AND is an outlier. - snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier) - snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier) - - # Clear memory. - rm(a.out, b.out, qa.out, qb.out, loci) - gc() - - return(snps) -} - -#' Gets hybrid index mean across MCMC iterations for each individual -#' @param df data.frame of BGC hybrid index results -#' @return Matrix containing MCMC hybrid index means -#' @noRd -get_hi <- function(df){ - hi.mean <- rowMeans(as.matrix(df)) - return(hi.mean) -} - -#' Parses BGC output and identifies outlier SNPs -#' @param df.list List of data.frame objects from combine_bgc_output() -#' @param admix.pop String identifying admixed population in population map -#' file -#' @param popmap Population map file consisting of two tab-separated columns: -#' indID popID (no header line) -#' @param qn Upper quantile interval value for determining outlier SNPs. -#' Default = 0.975 -#' @param loci.file Path to two-column tab-separated file containing loci names -#' (in same order as BGC input). Column 1 should be locus -#' name, column2 should be the position of the SNP. -#' header line should be: #CHROM POS -#' @param save.obj File path. If supplied, saves outlier info as .RDS object. -#' The value for save.obj should be the filename for the -#' RDS object -#' @return List containing 3 data.frame objects. 1) outlier info, -#' 2)popmap info, 3) hybrid index info -#' @export -#' @examples -#' outliers <- get_bgc_outliers(df.list = aggregated.results, -#' admix.pop = "population1", -#' popmap = "./popmap.txt", -#' qn = 0.975, -#' loci.file = "./bgc_loci.txt", -#' save.obj = "./outlierInfo.rds") -get_bgc_outliers <- function(df.list, - admix.pop, - popmap, - qn = 0.975, - loci.file = NULL, - save.obj = NULL){ - - # Get 95% credible intervals for alpha, beta, gamma-quantile (qa), - # zeta-quantile (qb). - a <- get_cis(df.list[[2]]) - gc() - b <- get_cis(df.list[[3]]) - gc() - qa <- get_cis(df.list[[4]]) - gc() - qb <- get_cis(df.list[[5]]) - gc() - ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha - tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta - - clean=FALSE - #if no loci file provided, make a spoof one here - if (is.null(loci.file)){ - clean=TRUE - spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE) - readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE) - loci.file="loci_map.txt" - } - - if (!file.exists(loci.file)){ - stop(paste0("Error: The loci file ", loci.file, " does not exist!")) - } - - if (!file.exists(popmap)){ - stop(paste0("Error: The popmap file ", popmap, " does not exist!")) - } - - # Find SNPs with excess ancestry and outliers SNPs. - snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb) - gc() - - # Get hybrid indexes. - hi <- get_hi(df.list[[6]]) - - # Read in popmap. - # Tab-separated, two-columns: indID\tpopID - popmap <- read.table(file = popmap, stringsAsFactors = FALSE) - - # Get data.frame with indIDs of admixed individuals and hybrid index. - admix <- popmap[ which(popmap$V2 == admix.pop),] - hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE) - - res <- list(snps, popmap, hi.df) - - rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df) - gc() - # If user specified filepath: save res as rds object. - if(!is.null(save.obj)){ - saveRDS(res, save.obj) - } - - gc() - - #clean up spoofed loci_map - if (clean == TRUE){ - file.remove(loci.file) - } - - return(res) - -} +#' Identify Outlier BGC SNPs Using Two Criteria +#' +#' These functions identify alpha and beta BGC outliers using two criteria. +#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap +#' with 0, it is considered to have excess ancestry (positive or negative). +#' Second, if a SNP's alpha or beta falls outside the quantile interval +#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative). +#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE. +#' These criteria are discussed in the Gompert BGC papers. +#' +#' Get credible intervals of a BGC parameter +#' @param df data.frame of one BGC parameter (e.g. alpha) +#' @return data.frame with credible intervals for the parameter +#' @noRd +get_cis <- function(df){ + # Get credible interval for parameters. + cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95)) + cis <- do.call("rbind", cis) + param.mean <- rowMeans(as.matrix(df)) + param.median <- apply(df, 1, function(x) median(x)) + + df2 <- data.frame(mean=param.mean, + median=param.median, + lb=cis$CI_low, + ub=cis$CI_high, + stringsAsFactors = FALSE) + + # Clear memory. + rm(param.mean, param.median, cis) + gc() + return(df2) +} + +#' Get alpha and beta outliers +#' @param a.out data.frame of alpha BGC results +#' @param b.out data.frame of beta BGC results +#' @param qa.out data.frame of qa (gamma-quantile) BGC results +#' @param qb.out data.frame of qb (zeta-quantile) BGC results +#' @param loci Path to two-column file containing loci names (in BGC order) +#' @param qn Upper quantile interval value +#' @return data.frame containing SNP outlier info +#' @noRd +get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){ + # Get excess ancestry and outliers for alpha and beta parameters. + names(a.out)[3:4] <- c('lb','ub') + names(b.out)[3:4] <- c('lb','ub') + names(qa.out)[3:4] <- c('lb','ub') + names(qb.out)[3:4] <- c('lb','ub') + + # Excess ancestry compared to genome-wide average introgression + # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively. + # Do alpha. + a.out$excess <- NA + a.out$excess[a.out$lb > 0] <- 'pos' + a.out$excess[a.out$ub < 0] <- 'neg' + + # Beta. + b.out$excess <- NA + b.out$excess[b.out$lb > 0] <- 'pos' + b.out$excess[b.out$ub < 0] <- 'neg' + + # Add in quantiles + a.out$q <- qa.out$mean + b.out$q <- qb.out$mean + + # qnorm takes SD, so take square root of the reciprocal of tau. + # This basically generates the interval to check if alpha and beta are + # outliers. + # The interval is defined by qn = n/2 and qn = 1-n/2. + # If the median for alpha or beta fall outside that interval, it's an + # outlier. + # By default, qn is set to 0.025 and 0.975 + qn_lower <- 1 - qn + + a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta)) + a.out$qub <- qnorm(qn,0,sqrt(1/ta)) + + b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb)) + b.out$qub <- qnorm(qn,0,sqrt(1/tb)) + + + # Check if median falls outside qn interval. + a.out$outlier <- NA + a.out$outlier[a.out$median > a.out$qub] <- 'pos' + a.out$outlier[a.out$median < a.out$qlb] <- 'neg' + + b.out$outlier <- NA + b.out$outlier[b.out$median > b.out$qub] <- 'pos' + b.out$outlier[b.out$median < b.out$qlb] <- 'neg' + + # Read loci file generated using my vcf2bgc.py script. + # Contains actual scaffold names. + snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE) + + if (nrow(snps) != nrow(a.out)){ + writeLines(paste0("\n\nError: There are ", nrow(snps), " loci ")) + writeLines("in the loci.file") + stop("loci.file length must equal rows in the BGC output") + } + + ### Copy the data to snps data.frame. + # Mean a and b + snps$alpha <- a.out$mean + snps$beta <- b.out$mean + + # Excess ancestry and outliers. + snps$alpha.excess <- a.out$excess + snps$beta.excess <- b.out$excess + snps$alpha.outlier <- a.out$outlier + snps$beta.outlier <- b.out$outlier + + # If has both excess ancestry AND is an outlier. + snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier) + snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier) + + # Clear memory. + rm(a.out, b.out, qa.out, qb.out, loci) + gc() + + return(snps) +} + +#' Gets hybrid index mean across MCMC iterations for each individual +#' @param df data.frame of BGC hybrid index results +#' @return Matrix containing MCMC hybrid index means +#' @noRd +get_hi <- function(df){ + hi.mean <- rowMeans(as.matrix(df)) + return(hi.mean) +} + +#' Parses BGC output and identifies outlier SNPs +#' @param df.list List of data.frame objects from combine_bgc_output() +#' @param admix.pop String identifying admixed population in population map +#' file +#' @param popmap Population map file consisting of two tab-separated columns: +#' indID popID (no header line) +#' @param qn Upper quantile interval value for determining outlier SNPs. +#' Default = 0.975 +#' @param loci.file Path to two-column tab-separated file containing loci names +#' (in same order as BGC input). Column 1 should be locus +#' name, column2 should be the position of the SNP. +#' header line should be: #CHROM POS +#' @param save.obj File path. If supplied, saves outlier info as .RDS object. +#' The value for save.obj should be the filename for the +#' RDS object +#' @return List containing 3 data.frame objects. 1) outlier info, +#' 2)popmap info, 3) hybrid index info +#' @export +#' @examples +#' outliers <- get_bgc_outliers(df.list = aggregated.results, +#' admix.pop = "population1", +#' popmap = "./popmap.txt", +#' qn = 0.975, +#' loci.file = "./bgc_loci.txt", +#' save.obj = "./outlierInfo.rds") +get_bgc_outliers <- function(df.list, + admix.pop, + popmap, + qn = 0.975, + loci.file = NULL, + save.obj = NULL){ + + # Get 95% credible intervals for alpha, beta, gamma-quantile (qa), + # zeta-quantile (qb). + a <- get_cis(df.list[[2]]) + gc() + b <- get_cis(df.list[[3]]) + gc() + qa <- get_cis(df.list[[4]]) + gc() + qb <- get_cis(df.list[[5]]) + gc() + ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha + tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta + + clean=FALSE + #if no loci file provided, make a spoof one here + if (is.null(loci.file)){ + clean=TRUE + spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE) + readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE) + loci.file="loci_map.txt" + } + + if (!file.exists(loci.file)){ + stop(paste0("Error: The loci file ", loci.file, " does not exist!")) + } + + if (!file.exists(popmap)){ + stop(paste0("Error: The popmap file ", popmap, " does not exist!")) + } + + # Find SNPs with excess ancestry and outliers SNPs. + snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb) + gc() + + # Get hybrid indexes. + hi <- get_hi(df.list[[6]]) + + # Read in popmap. + # Tab-separated, two-columns: indID\tpopID + popmap <- read.table(file = popmap, stringsAsFactors = FALSE) + + # Get data.frame with indIDs of admixed individuals and hybrid index. + admix <- popmap[ which(popmap$V2 == admix.pop),] + hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE) + + res <- list(snps, popmap, hi.df) + + rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df) + gc() + # If user specified filepath: save res as rds object. + if(!is.null(save.obj)){ + saveRDS(res, save.obj) + } + + gc() + + #clean up spoofed loci_map + if (clean == TRUE){ + file.remove(loci.file) + } + + return(res) + +}