From 9a9e6c2ccb4bb87eefa2df477e901ebc86fe13c4 Mon Sep 17 00:00:00 2001
From: stevemussmann <smussmann@gmail.com>
Date: Thu, 9 Apr 2026 11:35:12 -0700
Subject: [PATCH 1/2] fix outlier detection bug

---
 R/combine_bgc_output.R | 336 +++++++++++++++---------------
 R/get_bgc_outliers.R   | 451 +++++++++++++++++++++--------------------
 R/plot_traces.R        |  12 ++
 scripts/run_bgc.sh     |   8 +
 4 files changed, 422 insertions(+), 385 deletions(-)
 mode change 100644 => 100755 scripts/run_bgc.sh

diff --git a/R/combine_bgc_output.R b/R/combine_bgc_output.R
index 118628b..9e03505 100644
--- a/R/combine_bgc_output.R
+++ b/R/combine_bgc_output.R
@@ -1,161 +1,175 @@
-#' Aggregate BGC Output
-#'
-#' This function aggregates multiple BGC runs into one file. Runs are joined
-#' by columns and must all contain the same number of post-burnin MCMC
-#' iterations. It assumes that input files are contained in a single directory
-#' and have specific file suffixes. See the README.md file for details on
-#' file suffixes. Files can have population prefixes.
-#'
-#' @param results.dir Path to a directory containing all the BGC results
-#' @param prefix Prefix to the input files; e.g., "population1"
-#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half
-#' @param discard Discard first N samples from each BGC run
-#' @return A list of data.frames (each data.frame is a BGC parameter)
-#' @export
-#' @examples
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "population1")
-#'
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "population2",
-#'                                         thin = 2)
-#'
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "pop3",
-#'                                         discard = 2000)
-combine_bgc_output <- function(results.dir,
-                               prefix,
-                               thin = NULL,
-                               discard = NULL){
-  
-  writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n"))
-  
-  lnl <- list.files(path = results.dir,
-                    pattern = paste0(prefix, ".*lnl_\\d+$"),
-                    full.names = TRUE)
-  check_if_files(lnl)
-  
-  a0 <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*a0_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(a0)
-  
-  b0 <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*b0_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(b0)
-  
-  qa <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*qa_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(qa)
-  
-  qb <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*qb_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(qb)
-  
-  hi <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*hi_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(hi)
-  
-  gc()
-  
-  if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi))){
-    stop("Error: All parameters must have the same number of BGC runs!")
-  }
-  
-  if (!is.null(thin)){
-    writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin))
-  }
-  if (!is.null(discard)){
-    writeLines(paste0("\n\nDiscarding samples as burnin: ", discard))
-  }
-  
-  # Aggregate the BGC runs into one data.frame
-  lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard)
-  gc()
-  a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard)
-  gc()
-  b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard)
-  gc()
-  qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard)
-  gc()
-  qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard)
-  gc()
-  hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard)
-  gc()
-  
-  writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df)))
-  
-
-writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n"))
-
-df.list <-
-  list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df)
-
-return(df.list)
-}
-
-#' Validates that input files were found
-#' @param files A vector of file paths
-#' @noRd
-check_if_files <- function(files){
-  # Make sure list.files found the correct input files.
-  if (length(files) == 0){
-    stop(paste0("Error: The files ",
-                files,
-                " were not found in the input directory."))
-  }
-}
-
-#' Combine bgc runs together by binding columns (uses dplyr and data.table)
-#' @param files A vector of file paths
-#' @param thin Boolean. If TRUE, thins MCMC to every nth sample
-#' @noRd
-bind_bgc_runs <- function(files, thin = NULL, discard = NULL){
-  
-  data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE)
-  
-  if (!is.null(thin)){
-    data_list <- lapply(data_list, function(x) thin_MCMC(x, thin))
-  }
-  if (!is.null(discard)){
-    data_list <- lapply(data_list, function(x) discardNsamples(x, discard))
-  }
-  
-  df <- dplyr::bind_cols(data_list)
-  
-  rm(data_list)
-  gc()
-  return(df)
-}
-
-#' Validate that all parameters have same number of BGC runs
-#' @param l List of filepath vectors
-#' @noRd
-countBGCruns <- function(l){
-  return(length(unique(sapply(l, length))) == 1)
-}
-
-#' Thin MCMC iterations every Nth iteration.
-#' @param df data.frame with each MCMC sample as a column
-#' @param n Only keep every nth iteration
-#' @noRd
-thin_MCMC <- function(df, n){
-  df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)]
-  rm(df)
-  gc()
-  return(df.new)
-}
-
-#' Discard first N samples of each run
-#' @param df data.frame containing
-#' @noRd
-discardNsamples <- function(df, n){
-  df.new <- df[,(n+1):ncol(df)]
-  rm(df)
-  gc()
-  return(df.new)
-}
+#' Aggregate BGC Output
+#'
+#' This function aggregates multiple BGC runs into one file. Runs are joined
+#' by columns and must all contain the same number of post-burnin MCMC
+#' iterations. It assumes that input files are contained in a single directory
+#' and have specific file suffixes. See the README.md file for details on
+#' file suffixes. Files can have population prefixes.
+#'
+#' @param results.dir Path to a directory containing all the BGC results
+#' @param prefix Prefix to the input files; e.g., "population1"
+#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half
+#' @param discard Discard first N samples from each BGC run
+#' @return A list of data.frames (each data.frame is a BGC parameter)
+#' @export
+#' @examples
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "population1")
+#'
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "population2",
+#'                                         thin = 2)
+#'
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "pop3",
+#'                                         discard = 2000)
+combine_bgc_output <- function(results.dir,
+                               prefix,
+                               thin = NULL,
+                               discard = NULL){
+  
+  writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n"))
+  
+  lnl <- list.files(path = results.dir,
+                    pattern = paste0(prefix, ".*lnl_\\d+$"),
+                    full.names = TRUE)
+  check_if_files(lnl)
+  
+  a0 <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*a0_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(a0)
+  
+  b0 <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*b0_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(b0)
+  
+  qa <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*qa_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(qa)
+  
+  qb <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*qb_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(qb)
+  
+  hi <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*hi_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(hi)
+  
+  ta <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*ta_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(ta)
+  
+  tb <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*tb_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(tb)
+  
+  gc()
+  
+  if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){
+    stop("Error: All parameters must have the same number of BGC runs!")
+  }
+  
+  if (!is.null(thin)){
+    writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin))
+  }
+  if (!is.null(discard)){
+    writeLines(paste0("\n\nDiscarding samples as burnin: ", discard))
+  }
+  
+  # Aggregate the BGC runs into one data.frame
+  lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard)
+  gc()
+  a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard)
+  gc()
+  b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard)
+  gc()
+  qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard)
+  gc()
+  qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard)
+  gc()
+  hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard)
+  gc()
+  ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard)
+  gc()
+  tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard)
+  gc()
+  
+  writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df)))
+  
+
+writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n"))
+
+df.list <-
+  list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df)
+
+return(df.list)
+}
+
+#' Validates that input files were found
+#' @param files A vector of file paths
+#' @noRd
+check_if_files <- function(files){
+  # Make sure list.files found the correct input files.
+  if (length(files) == 0){
+    stop(paste0("Error: The files ",
+                files,
+                " were not found in the input directory."))
+  }
+}
+
+#' Combine bgc runs together by binding columns (uses dplyr and data.table)
+#' @param files A vector of file paths
+#' @param thin Boolean. If TRUE, thins MCMC to every nth sample
+#' @noRd
+bind_bgc_runs <- function(files, thin = NULL, discard = NULL){
+  
+  data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE)
+  
+  if (!is.null(thin)){
+    data_list <- lapply(data_list, function(x) thin_MCMC(x, thin))
+  }
+  if (!is.null(discard)){
+    data_list <- lapply(data_list, function(x) discardNsamples(x, discard))
+  }
+  
+  df <- dplyr::bind_cols(data_list)
+  
+  rm(data_list)
+  gc()
+  return(df)
+}
+
+#' Validate that all parameters have same number of BGC runs
+#' @param l List of filepath vectors
+#' @noRd
+countBGCruns <- function(l){
+  return(length(unique(sapply(l, length))) == 1)
+}
+
+#' Thin MCMC iterations every Nth iteration.
+#' @param df data.frame with each MCMC sample as a column
+#' @param n Only keep every nth iteration
+#' @noRd
+thin_MCMC <- function(df, n){
+  df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)]
+  rm(df)
+  gc()
+  return(df.new)
+}
+
+#' Discard first N samples of each run
+#' @param df data.frame containing
+#' @noRd
+discardNsamples <- function(df, n){
+  df.new <- df[,(n+1):ncol(df)]
+  rm(df)
+  gc()
+  return(df.new)
+}
diff --git a/R/get_bgc_outliers.R b/R/get_bgc_outliers.R
index 2c29957..c1e6801 100644
--- a/R/get_bgc_outliers.R
+++ b/R/get_bgc_outliers.R
@@ -1,224 +1,227 @@
-#' Identify Outlier BGC SNPs Using Two Criteria
-#'
-#' These functions identify alpha and beta BGC outliers using two criteria.
-#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap
-#' with 0, it is considered to have excess ancestry (positive or negative).
-#' Second, if a SNP's alpha or beta falls outside the quantile interval
-#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative).
-#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE.
-#' These criteria are discussed in the Gompert BGC papers.
-#'
-#' Get credible intervals of a BGC parameter
-#' @param df data.frame of one BGC parameter (e.g. alpha)
-#' @return data.frame with credible intervals for the parameter
-#' @noRd
-get_cis <- function(df){
-  # Get credible interval for parameters.
-  cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95))
-  cis <- do.call("rbind", cis)
-  param.mean <- rowMeans(as.matrix(df))
-  param.median <- apply(df, 1, function(x) median(x))
-
-  df2 <- data.frame(mean=param.mean,
-                    median=param.median,
-                    lb=cis$CI_low,
-                    ub=cis$CI_high,
-                    stringsAsFactors = FALSE)
-
-  # Clear memory.
-  rm(param.mean, param.median, cis)
-  gc()
-  return(df2)
-}
-
-#' Get alpha and beta outliers
-#' @param a.out data.frame of alpha BGC results
-#' @param b.out data.frame of beta BGC results
-#' @param qa.out data.frame of qa (gamma-quantile) BGC results
-#' @param qb.out data.frame of qb (zeta-quantile) BGC results
-#' @param loci Path to two-column file containing loci names (in BGC order)
-#' @param qn Upper quantile interval value
-#' @return data.frame containing SNP outlier info
-#' @noRd
-get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn){
-  # Get excess ancestry and outliers for alpha and beta parameters.
-  names(a.out)[3:4] <- c('lb','ub')
-  names(b.out)[3:4] <- c('lb','ub')
-  names(qa.out)[3:4] <- c('lb','ub')
-  names(qb.out)[3:4] <- c('lb','ub')
-
-  # Excess ancestry compared to genome-wide average introgression
-  # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively.
-  # Do alpha.
-  a.out$excess <- NA
-  a.out$excess[a.out$lb > 0] <- 'pos'
-  a.out$excess[a.out$ub < 0] <- 'neg'
-
-  # Beta.
-  b.out$excess <- NA
-  b.out$excess[b.out$lb > 0] <- 'pos'
-  b.out$excess[b.out$ub < 0] <- 'neg'
-
-  # Add in quantiles
-  a.out$q <- qa.out$mean
-  b.out$q <- qb.out$mean
-
-  # qnorm takes SD, so take the square root of the quantile estimate.
-  # This basically generates the interval to check if alpha and beta are
-  # outliers.
-  # The interval is defined by qn = n/2 and qn = 1-n/2.
-  # If the median for alpha or beta fall outside that interval, it's an
-  # outlier.
-  # By default, qn is set to 0.025 and 0.975
-  qn_lower <- 1 - qn
-
-  a.out$qlb <- qnorm(qn_lower,0,sqrt(a.out$q))
-  a.out$qub <- qnorm(qn,0,sqrt(a.out$q))
-
-  b.out$qlb <- qnorm(qn_lower,0,sqrt(b.out$q))
-  b.out$qub <- qnorm(qn,0,sqrt(b.out$q))
-
-  # Check if median falls outside qn interval.
-  a.out$outlier <- NA
-  a.out$outlier[a.out$median > a.out$qub] <- 'pos'
-  a.out$outlier[a.out$median < a.out$qlb] <- 'neg'
-
-  b.out$outlier <- NA
-  b.out$outlier[b.out$median > b.out$qub] <- 'pos'
-  b.out$outlier[b.out$median < b.out$qlb] <- 'neg'
-
-  # Read loci file generated using my vcf2bgc.py script.
-  # Contains actual scaffold names.
-  snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE)
-
-  if (nrow(snps) != nrow(a.out)){
-    writeLines(paste0("\n\nError: There are ", nrow(snps), " loci "))
-    writeLines("in the loci.file")
-    stop("loci.file length must equal rows in the BGC output")
-  }
-
-  ### Copy the data to snps data.frame.
-  # Mean a and b
-  snps$alpha <- a.out$mean
-  snps$beta <- b.out$mean
-
-  # Excess ancestry and outliers.
-  snps$alpha.excess <- a.out$excess
-  snps$beta.excess <- b.out$excess
-  snps$alpha.outlier <- a.out$outlier
-  snps$beta.outlier <- b.out$outlier
-
-  # If has both excess ancestry AND is an outlier.
-  snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier)
-  snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier)
-
-  # Clear memory.
-  rm(a.out, b.out, qa.out, qb.out, loci)
-  gc()
-
-  return(snps)
-}
-
-#' Gets hybrid index mean across MCMC iterations for each individual
-#' @param df data.frame of BGC hybrid index results
-#' @return Matrix containing MCMC hybrid index means
-#' @noRd
-get_hi <- function(df){
-  hi.mean <- rowMeans(as.matrix(df))
-  return(hi.mean)
-}
-
-#' Parses BGC output and identifies outlier SNPs
-#' @param df.list List of data.frame objects from combine_bgc_output()
-#' @param admix.pop String identifying admixed population in population map
-#'                  file
-#' @param popmap Population map file consisting of two tab-separated columns:
-#'               indID   popID (no header line)
-#' @param qn Upper quantile interval value for determining outlier SNPs.
-#'           Default = 0.975
-#' @param loci.file Path to two-column tab-separated file containing loci names
-#'                  (in same order as BGC input). Column 1 should be locus
-#'                  name, column2 should be the position of the SNP.
-#'                  header line should be: #CHROM POS
-#' @param save.obj File path. If supplied, saves outlier info as .RDS object.
-#'                 The value for save.obj should be the filename for the
-#'                 RDS object
-#' @return List containing 3 data.frame objects. 1) outlier info,
-#'         2)popmap info, 3) hybrid index info
-#' @export
-#' @examples
-#' outliers <- get_bgc_outliers(df.list = aggregated.results,
-#'                              admix.pop = "population1",
-#'                              popmap = "./popmap.txt",
-#'                              qn = 0.975,
-#'                              loci.file = "./bgc_loci.txt",
-#'                              save.obj = "./outlierInfo.rds")
-get_bgc_outliers <- function(df.list,
-                             admix.pop,
-                             popmap,
-                             qn = 0.975,
-                             loci.file = NULL,
-                             save.obj = NULL){
-
-  # Get 95% credible intervals for alpha, beta, gamma-quantile (qa),
-  # zeta-quantile (qb).
-  a <- get_cis(df.list[[2]])
-  gc()
-  b <- get_cis(df.list[[3]])
-  gc()
-  qa <- get_cis(df.list[[4]])
-  gc()
-  qb <- get_cis(df.list[[5]])
-  gc()
-
-  clean=FALSE
-  #if no loci file provided, make a spoof one here
-  if (is.null(loci.file)){
-    clean=TRUE
-    spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE)
-    readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE)
-    loci.file="loci_map.txt"
-  }
-
-  if (!file.exists(loci.file)){
-    stop(paste0("Error: The loci file ", loci.file, " does not exist!"))
-  }
-
-  if (!file.exists(popmap)){
-    stop(paste0("Error: The popmap file ", popmap, " does not exist!"))
-  }
-
-  # Find SNPs with excess ancestry and outliers SNPs.
-  snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn)
-  gc()
-
-  # Get hybrid indexes.
-  hi <- get_hi(df.list[[6]])
-
-  # Read in popmap.
-  # Tab-separated, two-columns: indID\tpopID
-  popmap <- read.table(file = popmap, stringsAsFactors = FALSE)
-
-  # Get data.frame with indIDs of admixed individuals and hybrid index.
-  admix <- popmap[ which(popmap$V2 == admix.pop),]
-  hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE)
-
-  res <- list(snps, popmap, hi.df)
-
-  rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df)
-  gc()
-  # If user specified filepath: save res as rds object.
-  if(!is.null(save.obj)){
-    saveRDS(res, save.obj)
-  }
-
-  gc()
-
-  #clean up spoofed loci_map
-  if (clean == TRUE){
-    file.remove(loci.file)
-  }
-
-  return(res)
-
-}
+#' Identify Outlier BGC SNPs Using Two Criteria
+#'
+#' These functions identify alpha and beta BGC outliers using two criteria.
+#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap
+#' with 0, it is considered to have excess ancestry (positive or negative).
+#' Second, if a SNP's alpha or beta falls outside the quantile interval
+#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative).
+#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE.
+#' These criteria are discussed in the Gompert BGC papers.
+#'
+#' Get credible intervals of a BGC parameter
+#' @param df data.frame of one BGC parameter (e.g. alpha)
+#' @return data.frame with credible intervals for the parameter
+#' @noRd
+get_cis <- function(df){
+  # Get credible interval for parameters.
+  cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95))
+  cis <- do.call("rbind", cis)
+  param.mean <- rowMeans(as.matrix(df))
+  param.median <- apply(df, 1, function(x) median(x))
+
+  df2 <- data.frame(mean=param.mean,
+                    median=param.median,
+                    lb=cis$CI_low,
+                    ub=cis$CI_high,
+                    stringsAsFactors = FALSE)
+
+  # Clear memory.
+  rm(param.mean, param.median, cis)
+  gc()
+  return(df2)
+}
+
+#' Get alpha and beta outliers
+#' @param a.out data.frame of alpha BGC results
+#' @param b.out data.frame of beta BGC results
+#' @param qa.out data.frame of qa (gamma-quantile) BGC results
+#' @param qb.out data.frame of qb (zeta-quantile) BGC results
+#' @param loci Path to two-column file containing loci names (in BGC order)
+#' @param qn Upper quantile interval value
+#' @return data.frame containing SNP outlier info
+#' @noRd
+get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){
+  # Get excess ancestry and outliers for alpha and beta parameters.
+  names(a.out)[3:4] <- c('lb','ub')
+  names(b.out)[3:4] <- c('lb','ub')
+  names(qa.out)[3:4] <- c('lb','ub')
+  names(qb.out)[3:4] <- c('lb','ub')
+
+  # Excess ancestry compared to genome-wide average introgression
+  # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively.
+  # Do alpha.
+  a.out$excess <- NA
+  a.out$excess[a.out$lb > 0] <- 'pos'
+  a.out$excess[a.out$ub < 0] <- 'neg'
+
+  # Beta.
+  b.out$excess <- NA
+  b.out$excess[b.out$lb > 0] <- 'pos'
+  b.out$excess[b.out$ub < 0] <- 'neg'
+
+  # Add in quantiles
+  a.out$q <- qa.out$mean
+  b.out$q <- qb.out$mean
+  
+  # qnorm takes SD, so take square root of the reciprocal of tau.
+  # This basically generates the interval to check if alpha and beta are
+  # outliers.
+  # The interval is defined by qn = n/2 and qn = 1-n/2.
+  # If the median for alpha or beta fall outside that interval, it's an
+  # outlier.
+  # By default, qn is set to 0.025 and 0.975
+  qn_lower <- 1 - qn
+
+  a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta))
+  a.out$qub <- qnorm(qn,0,sqrt(1/ta))
+  
+  b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb))
+  b.out$qub <- qnorm(qn,0,sqrt(1/tb))
+  
+  
+  # Check if median falls outside qn interval.
+  a.out$outlier <- NA
+  a.out$outlier[a.out$median > a.out$qub] <- 'pos'
+  a.out$outlier[a.out$median < a.out$qlb] <- 'neg'
+
+  b.out$outlier <- NA
+  b.out$outlier[b.out$median > b.out$qub] <- 'pos'
+  b.out$outlier[b.out$median < b.out$qlb] <- 'neg'
+
+  # Read loci file generated using my vcf2bgc.py script.
+  # Contains actual scaffold names.
+  snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE)
+
+  if (nrow(snps) != nrow(a.out)){
+    writeLines(paste0("\n\nError: There are ", nrow(snps), " loci "))
+    writeLines("in the loci.file")
+    stop("loci.file length must equal rows in the BGC output")
+  }
+
+  ### Copy the data to snps data.frame.
+  # Mean a and b
+  snps$alpha <- a.out$mean
+  snps$beta <- b.out$mean
+
+  # Excess ancestry and outliers.
+  snps$alpha.excess <- a.out$excess
+  snps$beta.excess <- b.out$excess
+  snps$alpha.outlier <- a.out$outlier
+  snps$beta.outlier <- b.out$outlier
+
+  # If has both excess ancestry AND is an outlier.
+  snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier)
+  snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier)
+
+  # Clear memory.
+  rm(a.out, b.out, qa.out, qb.out, loci)
+  gc()
+
+  return(snps)
+}
+
+#' Gets hybrid index mean across MCMC iterations for each individual
+#' @param df data.frame of BGC hybrid index results
+#' @return Matrix containing MCMC hybrid index means
+#' @noRd
+get_hi <- function(df){
+  hi.mean <- rowMeans(as.matrix(df))
+  return(hi.mean)
+}
+
+#' Parses BGC output and identifies outlier SNPs
+#' @param df.list List of data.frame objects from combine_bgc_output()
+#' @param admix.pop String identifying admixed population in population map
+#'                  file
+#' @param popmap Population map file consisting of two tab-separated columns:
+#'               indID   popID (no header line)
+#' @param qn Upper quantile interval value for determining outlier SNPs.
+#'           Default = 0.975
+#' @param loci.file Path to two-column tab-separated file containing loci names
+#'                  (in same order as BGC input). Column 1 should be locus
+#'                  name, column2 should be the position of the SNP.
+#'                  header line should be: #CHROM POS
+#' @param save.obj File path. If supplied, saves outlier info as .RDS object.
+#'                 The value for save.obj should be the filename for the
+#'                 RDS object
+#' @return List containing 3 data.frame objects. 1) outlier info,
+#'         2)popmap info, 3) hybrid index info
+#' @export
+#' @examples
+#' outliers <- get_bgc_outliers(df.list = aggregated.results,
+#'                              admix.pop = "population1",
+#'                              popmap = "./popmap.txt",
+#'                              qn = 0.975,
+#'                              loci.file = "./bgc_loci.txt",
+#'                              save.obj = "./outlierInfo.rds")
+get_bgc_outliers <- function(df.list,
+                             admix.pop,
+                             popmap,
+                             qn = 0.975,
+                             loci.file = NULL,
+                             save.obj = NULL){
+
+  # Get 95% credible intervals for alpha, beta, gamma-quantile (qa),
+  # zeta-quantile (qb).
+  a <- get_cis(df.list[[2]])
+  gc()
+  b <- get_cis(df.list[[3]])
+  gc()
+  qa <- get_cis(df.list[[4]])
+  gc()
+  qb <- get_cis(df.list[[5]])
+  gc()
+  ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha
+  tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta
+
+  clean=FALSE
+  #if no loci file provided, make a spoof one here
+  if (is.null(loci.file)){
+    clean=TRUE
+    spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE)
+    readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE)
+    loci.file="loci_map.txt"
+  }
+
+  if (!file.exists(loci.file)){
+    stop(paste0("Error: The loci file ", loci.file, " does not exist!"))
+  }
+
+  if (!file.exists(popmap)){
+    stop(paste0("Error: The popmap file ", popmap, " does not exist!"))
+  }
+
+  # Find SNPs with excess ancestry and outliers SNPs.
+  snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb)
+  gc()
+
+  # Get hybrid indexes.
+  hi <- get_hi(df.list[[6]])
+
+  # Read in popmap.
+  # Tab-separated, two-columns: indID\tpopID
+  popmap <- read.table(file = popmap, stringsAsFactors = FALSE)
+
+  # Get data.frame with indIDs of admixed individuals and hybrid index.
+  admix <- popmap[ which(popmap$V2 == admix.pop),]
+  hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE)
+
+  res <- list(snps, popmap, hi.df)
+
+  rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df)
+  gc()
+  # If user specified filepath: save res as rds object.
+  if(!is.null(save.obj)){
+    saveRDS(res, save.obj)
+  }
+
+  gc()
+
+  #clean up spoofed loci_map
+  if (clean == TRUE){
+    file.remove(loci.file)
+  }
+
+  return(res)
+
+}
diff --git a/R/plot_traces.R b/R/plot_traces.R
index fd9de03..2075f04 100644
--- a/R/plot_traces.R
+++ b/R/plot_traces.R
@@ -50,6 +50,18 @@ plot_traces <- function(df.list, prefix, plotDIR = "./plots", showPLOTS=FALSE){
                 prefix = prefix,
                 plotDIR = plotDIR,
                 showPLOTS=showPLOTS)
+  plotBGCparams(df = df.list[[7]],
+                ylab = "Tau-alpha",
+                bgc_param = "ta",
+                prefix = prefix,
+                plotDIR = plotDIR,
+                showPLOTS=showPLOTS)
+  plotBGCparams(df = df.list[[8]],
+                ylab = "Tau-beta",
+                bgc_param = "tb",
+                prefix = prefix,
+                plotDIR = plotDIR,
+                showPLOTS=showPLOTS)
 
   writeLines(paste("Saved trace plots to", plotDIR))
 
diff --git a/scripts/run_bgc.sh b/scripts/run_bgc.sh
old mode 100644
new mode 100755
index 794d8ae..e3524a1
--- a/scripts/run_bgc.sh
+++ b/scripts/run_bgc.sh
@@ -199,6 +199,14 @@ $ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \
 $ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \
 	-p zeta-quantile -o ${RESULTS_DIR}/${prefix}_stat_qb_${run} -s 2 -w 0
 
+# Get tau-alpha (ta)
+$ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \
+	-p tau-alpha -o ${RESULTS_DIR}/${prefix}_stat_ta_${run} -s 2 -w 0
+
+# Get tau-beta (tb)
+$ESTPOST_PATH/estpost -i ${RESULTS_DIR}/${prefix}_mcmcout_${run}.hdf5 \
+	-p tau-beta -o ${RESULTS_DIR}/${prefix}_stat_tb_${run} -s 2 -w 0
+
 echo "Done with estpost!"
 
 exit 0

From 1a9129617a33c7e0bb1fa0145bed55d8d768456c Mon Sep 17 00:00:00 2001
From: stevemussmann <smussmann@gmail.com>
Date: Thu, 9 Apr 2026 11:37:38 -0700
Subject: [PATCH 2/2] unix line breaks

---
 R/combine_bgc_output.R | 350 +++++++++++++++----------------
 R/get_bgc_outliers.R   | 454 ++++++++++++++++++++---------------------
 2 files changed, 402 insertions(+), 402 deletions(-)

diff --git a/R/combine_bgc_output.R b/R/combine_bgc_output.R
index 9e03505..05ef810 100644
--- a/R/combine_bgc_output.R
+++ b/R/combine_bgc_output.R
@@ -1,175 +1,175 @@
-#' Aggregate BGC Output
-#'
-#' This function aggregates multiple BGC runs into one file. Runs are joined
-#' by columns and must all contain the same number of post-burnin MCMC
-#' iterations. It assumes that input files are contained in a single directory
-#' and have specific file suffixes. See the README.md file for details on
-#' file suffixes. Files can have population prefixes.
-#'
-#' @param results.dir Path to a directory containing all the BGC results
-#' @param prefix Prefix to the input files; e.g., "population1"
-#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half
-#' @param discard Discard first N samples from each BGC run
-#' @return A list of data.frames (each data.frame is a BGC parameter)
-#' @export
-#' @examples
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "population1")
-#'
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "population2",
-#'                                         thin = 2)
-#'
-#' aggregate.results <- combine_bgc_output(results.dir = "./results",
-#'                                         prefix = "pop3",
-#'                                         discard = 2000)
-combine_bgc_output <- function(results.dir,
-                               prefix,
-                               thin = NULL,
-                               discard = NULL){
-  
-  writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n"))
-  
-  lnl <- list.files(path = results.dir,
-                    pattern = paste0(prefix, ".*lnl_\\d+$"),
-                    full.names = TRUE)
-  check_if_files(lnl)
-  
-  a0 <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*a0_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(a0)
-  
-  b0 <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*b0_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(b0)
-  
-  qa <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*qa_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(qa)
-  
-  qb <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*qb_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(qb)
-  
-  hi <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*hi_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(hi)
-  
-  ta <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*ta_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(ta)
-  
-  tb <- list.files(path = results.dir,
-                   pattern = paste0(prefix, ".*tb_\\d+$"),
-                   full.names = TRUE)
-  check_if_files(tb)
-  
-  gc()
-  
-  if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){
-    stop("Error: All parameters must have the same number of BGC runs!")
-  }
-  
-  if (!is.null(thin)){
-    writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin))
-  }
-  if (!is.null(discard)){
-    writeLines(paste0("\n\nDiscarding samples as burnin: ", discard))
-  }
-  
-  # Aggregate the BGC runs into one data.frame
-  lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard)
-  gc()
-  a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard)
-  gc()
-  b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard)
-  gc()
-  qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard)
-  gc()
-  qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard)
-  gc()
-  hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard)
-  gc()
-  ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard)
-  gc()
-  tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard)
-  gc()
-  
-  writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df)))
-  
-
-writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n"))
-
-df.list <-
-  list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df)
-
-return(df.list)
-}
-
-#' Validates that input files were found
-#' @param files A vector of file paths
-#' @noRd
-check_if_files <- function(files){
-  # Make sure list.files found the correct input files.
-  if (length(files) == 0){
-    stop(paste0("Error: The files ",
-                files,
-                " were not found in the input directory."))
-  }
-}
-
-#' Combine bgc runs together by binding columns (uses dplyr and data.table)
-#' @param files A vector of file paths
-#' @param thin Boolean. If TRUE, thins MCMC to every nth sample
-#' @noRd
-bind_bgc_runs <- function(files, thin = NULL, discard = NULL){
-  
-  data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE)
-  
-  if (!is.null(thin)){
-    data_list <- lapply(data_list, function(x) thin_MCMC(x, thin))
-  }
-  if (!is.null(discard)){
-    data_list <- lapply(data_list, function(x) discardNsamples(x, discard))
-  }
-  
-  df <- dplyr::bind_cols(data_list)
-  
-  rm(data_list)
-  gc()
-  return(df)
-}
-
-#' Validate that all parameters have same number of BGC runs
-#' @param l List of filepath vectors
-#' @noRd
-countBGCruns <- function(l){
-  return(length(unique(sapply(l, length))) == 1)
-}
-
-#' Thin MCMC iterations every Nth iteration.
-#' @param df data.frame with each MCMC sample as a column
-#' @param n Only keep every nth iteration
-#' @noRd
-thin_MCMC <- function(df, n){
-  df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)]
-  rm(df)
-  gc()
-  return(df.new)
-}
-
-#' Discard first N samples of each run
-#' @param df data.frame containing
-#' @noRd
-discardNsamples <- function(df, n){
-  df.new <- df[,(n+1):ncol(df)]
-  rm(df)
-  gc()
-  return(df.new)
-}
+#' Aggregate BGC Output
+#'
+#' This function aggregates multiple BGC runs into one file. Runs are joined
+#' by columns and must all contain the same number of post-burnin MCMC
+#' iterations. It assumes that input files are contained in a single directory
+#' and have specific file suffixes. See the README.md file for details on
+#' file suffixes. Files can have population prefixes.
+#'
+#' @param results.dir Path to a directory containing all the BGC results
+#' @param prefix Prefix to the input files; e.g., "population1"
+#' @param thin Thin to every n MCMC samples. E.g. thin=2 cuts samples in half
+#' @param discard Discard first N samples from each BGC run
+#' @return A list of data.frames (each data.frame is a BGC parameter)
+#' @export
+#' @examples
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "population1")
+#'
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "population2",
+#'                                         thin = 2)
+#'
+#' aggregate.results <- combine_bgc_output(results.dir = "./results",
+#'                                         prefix = "pop3",
+#'                                         discard = 2000)
+combine_bgc_output <- function(results.dir,
+                               prefix,
+                               thin = NULL,
+                               discard = NULL){
+  
+  writeLines(paste0("\n\nLoading input files with prefix ", prefix, "...\n"))
+  
+  lnl <- list.files(path = results.dir,
+                    pattern = paste0(prefix, ".*lnl_\\d+$"),
+                    full.names = TRUE)
+  check_if_files(lnl)
+  
+  a0 <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*a0_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(a0)
+  
+  b0 <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*b0_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(b0)
+  
+  qa <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*qa_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(qa)
+  
+  qb <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*qb_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(qb)
+  
+  hi <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*hi_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(hi)
+  
+  ta <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*ta_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(ta)
+  
+  tb <- list.files(path = results.dir,
+                   pattern = paste0(prefix, ".*tb_\\d+$"),
+                   full.names = TRUE)
+  check_if_files(tb)
+  
+  gc()
+  
+  if (!countBGCruns(list(lnl, a0, b0, qa, qb, hi, ta, tb))){
+    stop("Error: All parameters must have the same number of BGC runs!")
+  }
+  
+  if (!is.null(thin)){
+    writeLines(paste0("\n\nThinning MCMC samples with frequency: ", thin))
+  }
+  if (!is.null(discard)){
+    writeLines(paste0("\n\nDiscarding samples as burnin: ", discard))
+  }
+  
+  # Aggregate the BGC runs into one data.frame
+  lnl.df <- bind_bgc_runs(lnl, thin = thin, discard = discard)
+  gc()
+  a0.df <- bind_bgc_runs(a0, thin = thin, discard = discard)
+  gc()
+  b0.df <- bind_bgc_runs(b0, thin = thin, discard = discard)
+  gc()
+  qa.df <- bind_bgc_runs(qa, thin = thin, discard = discard)
+  gc()
+  qb.df <- bind_bgc_runs(qb, thin = thin, discard = discard)
+  gc()
+  hi.df <- bind_bgc_runs(hi, thin = thin, discard = discard)
+  gc()
+  ta.df <- bind_bgc_runs(ta, thin = thin, discard = discard)
+  gc()
+  tb.df <- bind_bgc_runs(tb, thin = thin, discard = discard)
+  gc()
+  
+  writeLines(paste0("\n\n # MCMC samples = ", ncol(lnl.df)))
+  
+
+writeLines(paste0("\n\nDone! Found ", length(lnl), " BGC runs.\n\n"))
+
+df.list <-
+  list(lnl=lnl.df, a0=a0.df, b0=b0.df, qa=qa.df, qb=qb.df, hi=hi.df, ta=ta.df, tb=tb.df)
+
+return(df.list)
+}
+
+#' Validates that input files were found
+#' @param files A vector of file paths
+#' @noRd
+check_if_files <- function(files){
+  # Make sure list.files found the correct input files.
+  if (length(files) == 0){
+    stop(paste0("Error: The files ",
+                files,
+                " were not found in the input directory."))
+  }
+}
+
+#' Combine bgc runs together by binding columns (uses dplyr and data.table)
+#' @param files A vector of file paths
+#' @param thin Boolean. If TRUE, thins MCMC to every nth sample
+#' @noRd
+bind_bgc_runs <- function(files, thin = NULL, discard = NULL){
+  
+  data_list <- lapply(files, data.table::fread, stringsAsFactors = FALSE)
+  
+  if (!is.null(thin)){
+    data_list <- lapply(data_list, function(x) thin_MCMC(x, thin))
+  }
+  if (!is.null(discard)){
+    data_list <- lapply(data_list, function(x) discardNsamples(x, discard))
+  }
+  
+  df <- dplyr::bind_cols(data_list)
+  
+  rm(data_list)
+  gc()
+  return(df)
+}
+
+#' Validate that all parameters have same number of BGC runs
+#' @param l List of filepath vectors
+#' @noRd
+countBGCruns <- function(l){
+  return(length(unique(sapply(l, length))) == 1)
+}
+
+#' Thin MCMC iterations every Nth iteration.
+#' @param df data.frame with each MCMC sample as a column
+#' @param n Only keep every nth iteration
+#' @noRd
+thin_MCMC <- function(df, n){
+  df.new <- df[ ,seq(from = 1, to = ncol(df), by = n)]
+  rm(df)
+  gc()
+  return(df.new)
+}
+
+#' Discard first N samples of each run
+#' @param df data.frame containing
+#' @noRd
+discardNsamples <- function(df, n){
+  df.new <- df[,(n+1):ncol(df)]
+  rm(df)
+  gc()
+  return(df.new)
+}
diff --git a/R/get_bgc_outliers.R b/R/get_bgc_outliers.R
index c1e6801..71a73d5 100644
--- a/R/get_bgc_outliers.R
+++ b/R/get_bgc_outliers.R
@@ -1,227 +1,227 @@
-#' Identify Outlier BGC SNPs Using Two Criteria
-#'
-#' These functions identify alpha and beta BGC outliers using two criteria.
-#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap
-#' with 0, it is considered to have excess ancestry (positive or negative).
-#' Second, if a SNP's alpha or beta falls outside the quantile interval
-#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative).
-#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE.
-#' These criteria are discussed in the Gompert BGC papers.
-#'
-#' Get credible intervals of a BGC parameter
-#' @param df data.frame of one BGC parameter (e.g. alpha)
-#' @return data.frame with credible intervals for the parameter
-#' @noRd
-get_cis <- function(df){
-  # Get credible interval for parameters.
-  cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95))
-  cis <- do.call("rbind", cis)
-  param.mean <- rowMeans(as.matrix(df))
-  param.median <- apply(df, 1, function(x) median(x))
-
-  df2 <- data.frame(mean=param.mean,
-                    median=param.median,
-                    lb=cis$CI_low,
-                    ub=cis$CI_high,
-                    stringsAsFactors = FALSE)
-
-  # Clear memory.
-  rm(param.mean, param.median, cis)
-  gc()
-  return(df2)
-}
-
-#' Get alpha and beta outliers
-#' @param a.out data.frame of alpha BGC results
-#' @param b.out data.frame of beta BGC results
-#' @param qa.out data.frame of qa (gamma-quantile) BGC results
-#' @param qb.out data.frame of qb (zeta-quantile) BGC results
-#' @param loci Path to two-column file containing loci names (in BGC order)
-#' @param qn Upper quantile interval value
-#' @return data.frame containing SNP outlier info
-#' @noRd
-get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){
-  # Get excess ancestry and outliers for alpha and beta parameters.
-  names(a.out)[3:4] <- c('lb','ub')
-  names(b.out)[3:4] <- c('lb','ub')
-  names(qa.out)[3:4] <- c('lb','ub')
-  names(qb.out)[3:4] <- c('lb','ub')
-
-  # Excess ancestry compared to genome-wide average introgression
-  # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively.
-  # Do alpha.
-  a.out$excess <- NA
-  a.out$excess[a.out$lb > 0] <- 'pos'
-  a.out$excess[a.out$ub < 0] <- 'neg'
-
-  # Beta.
-  b.out$excess <- NA
-  b.out$excess[b.out$lb > 0] <- 'pos'
-  b.out$excess[b.out$ub < 0] <- 'neg'
-
-  # Add in quantiles
-  a.out$q <- qa.out$mean
-  b.out$q <- qb.out$mean
-  
-  # qnorm takes SD, so take square root of the reciprocal of tau.
-  # This basically generates the interval to check if alpha and beta are
-  # outliers.
-  # The interval is defined by qn = n/2 and qn = 1-n/2.
-  # If the median for alpha or beta fall outside that interval, it's an
-  # outlier.
-  # By default, qn is set to 0.025 and 0.975
-  qn_lower <- 1 - qn
-
-  a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta))
-  a.out$qub <- qnorm(qn,0,sqrt(1/ta))
-  
-  b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb))
-  b.out$qub <- qnorm(qn,0,sqrt(1/tb))
-  
-  
-  # Check if median falls outside qn interval.
-  a.out$outlier <- NA
-  a.out$outlier[a.out$median > a.out$qub] <- 'pos'
-  a.out$outlier[a.out$median < a.out$qlb] <- 'neg'
-
-  b.out$outlier <- NA
-  b.out$outlier[b.out$median > b.out$qub] <- 'pos'
-  b.out$outlier[b.out$median < b.out$qlb] <- 'neg'
-
-  # Read loci file generated using my vcf2bgc.py script.
-  # Contains actual scaffold names.
-  snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE)
-
-  if (nrow(snps) != nrow(a.out)){
-    writeLines(paste0("\n\nError: There are ", nrow(snps), " loci "))
-    writeLines("in the loci.file")
-    stop("loci.file length must equal rows in the BGC output")
-  }
-
-  ### Copy the data to snps data.frame.
-  # Mean a and b
-  snps$alpha <- a.out$mean
-  snps$beta <- b.out$mean
-
-  # Excess ancestry and outliers.
-  snps$alpha.excess <- a.out$excess
-  snps$beta.excess <- b.out$excess
-  snps$alpha.outlier <- a.out$outlier
-  snps$beta.outlier <- b.out$outlier
-
-  # If has both excess ancestry AND is an outlier.
-  snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier)
-  snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier)
-
-  # Clear memory.
-  rm(a.out, b.out, qa.out, qb.out, loci)
-  gc()
-
-  return(snps)
-}
-
-#' Gets hybrid index mean across MCMC iterations for each individual
-#' @param df data.frame of BGC hybrid index results
-#' @return Matrix containing MCMC hybrid index means
-#' @noRd
-get_hi <- function(df){
-  hi.mean <- rowMeans(as.matrix(df))
-  return(hi.mean)
-}
-
-#' Parses BGC output and identifies outlier SNPs
-#' @param df.list List of data.frame objects from combine_bgc_output()
-#' @param admix.pop String identifying admixed population in population map
-#'                  file
-#' @param popmap Population map file consisting of two tab-separated columns:
-#'               indID   popID (no header line)
-#' @param qn Upper quantile interval value for determining outlier SNPs.
-#'           Default = 0.975
-#' @param loci.file Path to two-column tab-separated file containing loci names
-#'                  (in same order as BGC input). Column 1 should be locus
-#'                  name, column2 should be the position of the SNP.
-#'                  header line should be: #CHROM POS
-#' @param save.obj File path. If supplied, saves outlier info as .RDS object.
-#'                 The value for save.obj should be the filename for the
-#'                 RDS object
-#' @return List containing 3 data.frame objects. 1) outlier info,
-#'         2)popmap info, 3) hybrid index info
-#' @export
-#' @examples
-#' outliers <- get_bgc_outliers(df.list = aggregated.results,
-#'                              admix.pop = "population1",
-#'                              popmap = "./popmap.txt",
-#'                              qn = 0.975,
-#'                              loci.file = "./bgc_loci.txt",
-#'                              save.obj = "./outlierInfo.rds")
-get_bgc_outliers <- function(df.list,
-                             admix.pop,
-                             popmap,
-                             qn = 0.975,
-                             loci.file = NULL,
-                             save.obj = NULL){
-
-  # Get 95% credible intervals for alpha, beta, gamma-quantile (qa),
-  # zeta-quantile (qb).
-  a <- get_cis(df.list[[2]])
-  gc()
-  b <- get_cis(df.list[[3]])
-  gc()
-  qa <- get_cis(df.list[[4]])
-  gc()
-  qb <- get_cis(df.list[[5]])
-  gc()
-  ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha
-  tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta
-
-  clean=FALSE
-  #if no loci file provided, make a spoof one here
-  if (is.null(loci.file)){
-    clean=TRUE
-    spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE)
-    readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE)
-    loci.file="loci_map.txt"
-  }
-
-  if (!file.exists(loci.file)){
-    stop(paste0("Error: The loci file ", loci.file, " does not exist!"))
-  }
-
-  if (!file.exists(popmap)){
-    stop(paste0("Error: The popmap file ", popmap, " does not exist!"))
-  }
-
-  # Find SNPs with excess ancestry and outliers SNPs.
-  snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb)
-  gc()
-
-  # Get hybrid indexes.
-  hi <- get_hi(df.list[[6]])
-
-  # Read in popmap.
-  # Tab-separated, two-columns: indID\tpopID
-  popmap <- read.table(file = popmap, stringsAsFactors = FALSE)
-
-  # Get data.frame with indIDs of admixed individuals and hybrid index.
-  admix <- popmap[ which(popmap$V2 == admix.pop),]
-  hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE)
-
-  res <- list(snps, popmap, hi.df)
-
-  rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df)
-  gc()
-  # If user specified filepath: save res as rds object.
-  if(!is.null(save.obj)){
-    saveRDS(res, save.obj)
-  }
-
-  gc()
-
-  #clean up spoofed loci_map
-  if (clean == TRUE){
-    file.remove(loci.file)
-  }
-
-  return(res)
-
-}
+#' Identify Outlier BGC SNPs Using Two Criteria
+#'
+#' These functions identify alpha and beta BGC outliers using two criteria.
+#' First, if a SNP's 95% credible interval for alpha or beta doesn't overlap
+#' with 0, it is considered to have excess ancestry (positive or negative).
+#' Second, if a SNP's alpha or beta falls outside the quantile interval
+#' qn/2 and (qn-1)/2, it is considered an outlier (positive or negative).
+#' If both criteriea are met for alpha or beta, crazy.a or crazy.b are TRUE.
+#' These criteria are discussed in the Gompert BGC papers.
+#'
+#' Get credible intervals of a BGC parameter
+#' @param df data.frame of one BGC parameter (e.g. alpha)
+#' @return data.frame with credible intervals for the parameter
+#' @noRd
+get_cis <- function(df){
+  # Get credible interval for parameters.
+  cis <- apply(df, 1, function(x) bayestestR::ci(x = x, ci=0.95))
+  cis <- do.call("rbind", cis)
+  param.mean <- rowMeans(as.matrix(df))
+  param.median <- apply(df, 1, function(x) median(x))
+
+  df2 <- data.frame(mean=param.mean,
+                    median=param.median,
+                    lb=cis$CI_low,
+                    ub=cis$CI_high,
+                    stringsAsFactors = FALSE)
+
+  # Clear memory.
+  rm(param.mean, param.median, cis)
+  gc()
+  return(df2)
+}
+
+#' Get alpha and beta outliers
+#' @param a.out data.frame of alpha BGC results
+#' @param b.out data.frame of beta BGC results
+#' @param qa.out data.frame of qa (gamma-quantile) BGC results
+#' @param qb.out data.frame of qb (zeta-quantile) BGC results
+#' @param loci Path to two-column file containing loci names (in BGC order)
+#' @param qn Upper quantile interval value
+#' @return data.frame containing SNP outlier info
+#' @noRd
+get_ab_outliers <- function(a.out, b.out, qa.out, qb.out, loci, qn, ta, tb){
+  # Get excess ancestry and outliers for alpha and beta parameters.
+  names(a.out)[3:4] <- c('lb','ub')
+  names(b.out)[3:4] <- c('lb','ub')
+  names(qa.out)[3:4] <- c('lb','ub')
+  names(qb.out)[3:4] <- c('lb','ub')
+
+  # Excess ancestry compared to genome-wide average introgression
+  # If lb > 0 or ub < 0, it has excess ancestry for P0 or P1, respectively.
+  # Do alpha.
+  a.out$excess <- NA
+  a.out$excess[a.out$lb > 0] <- 'pos'
+  a.out$excess[a.out$ub < 0] <- 'neg'
+
+  # Beta.
+  b.out$excess <- NA
+  b.out$excess[b.out$lb > 0] <- 'pos'
+  b.out$excess[b.out$ub < 0] <- 'neg'
+
+  # Add in quantiles
+  a.out$q <- qa.out$mean
+  b.out$q <- qb.out$mean
+  
+  # qnorm takes SD, so take square root of the reciprocal of tau.
+  # This basically generates the interval to check if alpha and beta are
+  # outliers.
+  # The interval is defined by qn = n/2 and qn = 1-n/2.
+  # If the median for alpha or beta fall outside that interval, it's an
+  # outlier.
+  # By default, qn is set to 0.025 and 0.975
+  qn_lower <- 1 - qn
+
+  a.out$qlb <- qnorm(qn_lower,0,sqrt(1/ta))
+  a.out$qub <- qnorm(qn,0,sqrt(1/ta))
+  
+  b.out$qlb <- qnorm(qn_lower,0,sqrt(1/tb))
+  b.out$qub <- qnorm(qn,0,sqrt(1/tb))
+  
+  
+  # Check if median falls outside qn interval.
+  a.out$outlier <- NA
+  a.out$outlier[a.out$median > a.out$qub] <- 'pos'
+  a.out$outlier[a.out$median < a.out$qlb] <- 'neg'
+
+  b.out$outlier <- NA
+  b.out$outlier[b.out$median > b.out$qub] <- 'pos'
+  b.out$outlier[b.out$median < b.out$qlb] <- 'neg'
+
+  # Read loci file generated using my vcf2bgc.py script.
+  # Contains actual scaffold names.
+  snps <- read.delim(loci, sep = " ", stringsAsFactors = FALSE)
+
+  if (nrow(snps) != nrow(a.out)){
+    writeLines(paste0("\n\nError: There are ", nrow(snps), " loci "))
+    writeLines("in the loci.file")
+    stop("loci.file length must equal rows in the BGC output")
+  }
+
+  ### Copy the data to snps data.frame.
+  # Mean a and b
+  snps$alpha <- a.out$mean
+  snps$beta <- b.out$mean
+
+  # Excess ancestry and outliers.
+  snps$alpha.excess <- a.out$excess
+  snps$beta.excess <- b.out$excess
+  snps$alpha.outlier <- a.out$outlier
+  snps$beta.outlier <- b.out$outlier
+
+  # If has both excess ancestry AND is an outlier.
+  snps$crazy.a <- !is.na(snps$alpha.excess) & !is.na(snps$alpha.outlier)
+  snps$crazy.b <- !is.na(snps$beta.excess) & !is.na(snps$beta.outlier)
+
+  # Clear memory.
+  rm(a.out, b.out, qa.out, qb.out, loci)
+  gc()
+
+  return(snps)
+}
+
+#' Gets hybrid index mean across MCMC iterations for each individual
+#' @param df data.frame of BGC hybrid index results
+#' @return Matrix containing MCMC hybrid index means
+#' @noRd
+get_hi <- function(df){
+  hi.mean <- rowMeans(as.matrix(df))
+  return(hi.mean)
+}
+
+#' Parses BGC output and identifies outlier SNPs
+#' @param df.list List of data.frame objects from combine_bgc_output()
+#' @param admix.pop String identifying admixed population in population map
+#'                  file
+#' @param popmap Population map file consisting of two tab-separated columns:
+#'               indID   popID (no header line)
+#' @param qn Upper quantile interval value for determining outlier SNPs.
+#'           Default = 0.975
+#' @param loci.file Path to two-column tab-separated file containing loci names
+#'                  (in same order as BGC input). Column 1 should be locus
+#'                  name, column2 should be the position of the SNP.
+#'                  header line should be: #CHROM POS
+#' @param save.obj File path. If supplied, saves outlier info as .RDS object.
+#'                 The value for save.obj should be the filename for the
+#'                 RDS object
+#' @return List containing 3 data.frame objects. 1) outlier info,
+#'         2)popmap info, 3) hybrid index info
+#' @export
+#' @examples
+#' outliers <- get_bgc_outliers(df.list = aggregated.results,
+#'                              admix.pop = "population1",
+#'                              popmap = "./popmap.txt",
+#'                              qn = 0.975,
+#'                              loci.file = "./bgc_loci.txt",
+#'                              save.obj = "./outlierInfo.rds")
+get_bgc_outliers <- function(df.list,
+                             admix.pop,
+                             popmap,
+                             qn = 0.975,
+                             loci.file = NULL,
+                             save.obj = NULL){
+
+  # Get 95% credible intervals for alpha, beta, gamma-quantile (qa),
+  # zeta-quantile (qb).
+  a <- get_cis(df.list[[2]])
+  gc()
+  b <- get_cis(df.list[[3]])
+  gc()
+  qa <- get_cis(df.list[[4]])
+  gc()
+  qb <- get_cis(df.list[[5]])
+  gc()
+  ta <- mean(unlist(bgc.genes[["ta"]])) # get estimate of Tau-alpha
+  tb <- mean(unlist(bgc.genes[["tb"]])) # get estimate of Tau-beta
+
+  clean=FALSE
+  #if no loci file provided, make a spoof one here
+  if (is.null(loci.file)){
+    clean=TRUE
+    spoof<-data.frame("#CHROM" = 0:((nrow(df.list[[2]][,1])-1)), "POS"=0:((nrow(df.list[[2]][,1])-1)), check.names = FALSE)
+    readr::write_delim(spoof, "loci_map.txt", delim=" ", col_names=TRUE)
+    loci.file="loci_map.txt"
+  }
+
+  if (!file.exists(loci.file)){
+    stop(paste0("Error: The loci file ", loci.file, " does not exist!"))
+  }
+
+  if (!file.exists(popmap)){
+    stop(paste0("Error: The popmap file ", popmap, " does not exist!"))
+  }
+
+  # Find SNPs with excess ancestry and outliers SNPs.
+  snps <- get_ab_outliers(a, b, qa, qb, loci.file, qn, ta, tb)
+  gc()
+
+  # Get hybrid indexes.
+  hi <- get_hi(df.list[[6]])
+
+  # Read in popmap.
+  # Tab-separated, two-columns: indID\tpopID
+  popmap <- read.table(file = popmap, stringsAsFactors = FALSE)
+
+  # Get data.frame with indIDs of admixed individuals and hybrid index.
+  admix <- popmap[ which(popmap$V2 == admix.pop),]
+  hi.df <- data.frame(id=admix$V1, hi=hi, stringsAsFactors = FALSE)
+
+  res <- list(snps, popmap, hi.df)
+
+  rm(a, b, qa, qb, snps, hi, popmap, admix, hi.df)
+  gc()
+  # If user specified filepath: save res as rds object.
+  if(!is.null(save.obj)){
+    saveRDS(res, save.obj)
+  }
+
+  gc()
+
+  #clean up spoofed loci_map
+  if (clean == TRUE){
+    file.remove(loci.file)
+  }
+
+  return(res)
+
+}