From 894c491e92a6f9d86dc32ac1329361f7e8034955 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Sat, 23 Aug 2025 19:59:53 +1000 Subject: [PATCH] Use spline-based GC bias correction --- Cargo.toml | 2 + src/bin/coverm.rs | 26 +++++ src/cli.rs | 38 ++++++++ src/gc_bias.rs | 241 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 + 5 files changed, 310 insertions(+) create mode 100644 src/gc_bias.rs diff --git a/Cargo.toml b/Cargo.toml index c996c36..9a1dfcf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,8 @@ bird_tool_utils-man = "0.4.0" roff = "0.2.*" needletail = "0.5.*" csv = "1.*" +splines = "5.0" +plotters = "0.3" [dev-dependencies] assert_cli = "0.6.*" diff --git a/src/bin/coverm.rs b/src/bin/coverm.rs index 76fe0ca..abbcc23 100644 --- a/src/bin/coverm.rs +++ b/src/bin/coverm.rs @@ -498,6 +498,32 @@ fn main() { } } } + Some("gc-bias") => { + let m = matches.subcommand_matches("gc-bias").unwrap(); + set_log_level(m, true); + let reference = m.get_one::("reference").unwrap(); + let bam_files: Vec<&str> = m + .get_many::("bam-files") + .unwrap() + .map(|x| &**x) + .collect(); + let window = *m.get_one::("window-size").unwrap(); + let min_cov = *m.get_one::("min-contig-coverage").unwrap(); + let plot_path = m.get_one::("plot").map(|s| s.as_str()); + match coverm::gc_bias::gc_bias_correct( + reference, &bam_files, window, min_cov, plot_path, + ) { + Ok(res) => { + for (name, cov) in res { + println!("{name}\t{cov}"); + } + } + Err(e) => { + error!("{}", e); + process::exit(1); + } + } + } Some("contig") => { let m = matches.subcommand_matches("contig").unwrap(); bird_tool_utils::clap_utils::print_full_help_if_needed(m, contig_full_help()); diff --git a/src/cli.rs b/src/cli.rs index a3839e2..21c4dc1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1037,6 +1037,7 @@ Usage: coverm ... Main subcommands: \tcontig\tCalculate coverage of contigs \tgenome\tCalculate coverage of genomes +\tgc-bias\tAdjust coverage for GC bias Less used utility subcommands: \tmake\tGenerate BAM files through alignment @@ -2153,6 +2154,43 @@ Ben J. Woodcroft .action(clap::ArgAction::SetTrue), ), ) + .subcommand( + add_clap_verbosity_flags(Command::new("gc-bias")) + .about("Calculate GC bias adjusted coverage") + .arg( + Arg::new("bam-files") + .short('b') + .long("bam-files") + .required(true) + .action(clap::ArgAction::Append) + .num_args(1..), + ) + .arg( + Arg::new("reference") + .short('r') + .long("reference") + .required(true), + ) + .arg( + Arg::new("window-size") + .short('w') + .long("window-size") + .value_parser(value_parser!(usize)) + .default_value("1000"), + ) + .arg( + Arg::new("min-contig-coverage") + .long("min-contig-coverage") + .value_parser(value_parser!(f64)) + .default_value("20"), + ) + .arg( + Arg::new("plot") + .long("plot") + .value_parser(value_parser!(String)) + .help("Output PNG plot of GC bias spline"), + ), + ) .subcommand( add_clap_verbosity_flags(Command::new("shell-completion")) .about("Generate a shell completion script for coverm") diff --git a/src/gc_bias.rs b/src/gc_bias.rs new file mode 100644 index 0000000..78014f8 --- /dev/null +++ b/src/gc_bias.rs @@ -0,0 +1,241 @@ +use bio::io::fasta; +use plotters::prelude::*; +use rust_htslib::bam; +use rust_htslib::bam::Read; +use splines::{Interpolation, Key, Spline}; +use std::collections::HashMap; +use std::error::Error; + +/// Fit a spline to GC fraction and relative coverage pairs. +/// Returns the spline and the binned points used for fitting. +pub(crate) fn fit_gc_bias_spline( + gc: &[f64], + rel_cov: &[f64], +) -> (Spline, Vec<(f64, f64)>) { + assert_eq!(gc.len(), rel_cov.len()); + let bins = 20usize; + let mut bin_sums = vec![0.0f64; bins]; + let mut bin_counts = vec![0usize; bins]; + for (&g, &r) in gc.iter().zip(rel_cov.iter()) { + let g_clamped = g.clamp(0.0, 1.0); + let idx = ((g_clamped * (bins as f64 - 1.0)).round()) as usize; + bin_sums[idx] += r; + bin_counts[idx] += 1; + } + let mut points: Vec<(f64, f64)> = Vec::new(); + let mut keys: Vec> = Vec::new(); + for i in 0..bins { + if bin_counts[i] > 0 { + let x = (i as f64 + 0.5) / bins as f64; + let y = bin_sums[i] / bin_counts[i] as f64; + points.push((x, y)); + keys.push(Key::new(x, y, Interpolation::Linear)); + } + } + (Spline::from_vec(keys.clone()), points) +} + +fn plot_spline( + points: &[(f64, f64)], + spline: &Spline, + path: &str, +) -> Result<(), Box> { + let root = BitMapBackend::new(path, (640, 480)).into_drawing_area(); + root.fill(&WHITE)?; + let y_min = points.iter().map(|(_, y)| *y).fold(f64::INFINITY, f64::min); + let y_max = points + .iter() + .map(|(_, y)| *y) + .fold(f64::NEG_INFINITY, f64::max); + let mut chart = ChartBuilder::on(&root) + .caption("GC bias spline", ("sans-serif", 20)) + .margin(20) + .x_label_area_size(30) + .y_label_area_size(40) + .build_cartesian_2d(0f64..1f64, y_min..y_max)?; + chart + .configure_mesh() + .x_desc("GC fraction") + .y_desc("Relative coverage") + .draw()?; + chart.draw_series( + points + .iter() + .map(|(x, y)| Circle::new((*x, *y), 3, BLUE.filled())), + )?; + chart.draw_series(LineSeries::new( + (0..100).map(|i| { + let gc = i as f64 / 99.0; + let y = spline.clamped_sample(gc).unwrap_or(1.0); + (gc, y) + }), + &RED, + ))?; + root.present()?; + Ok(()) +} + +/// Calculate GC bias adjusted coverage for each contig given a reference fasta +/// and BAM files. Returns a vector of (contig_name, adjusted_coverage). +/// Only contigs with mean coverage >= `min_contig_cov` are used to fit the spline. +pub fn gc_bias_correct( + reference: &str, + bam_files: &[&str], + window_size: usize, + min_contig_cov: f64, + plot_path: Option<&str>, +) -> Result, Box> { + // Read reference sequences + let fasta_reader = fasta::Reader::from_file(reference)?; + let mut sequences: Vec<(String, Vec)> = Vec::new(); + for r in fasta_reader.records() { + let rec = r?; + sequences.push((rec.id().to_string(), rec.seq().to_vec())); + } + let mut name_to_index = HashMap::new(); + for (i, (name, _)) in sequences.iter().enumerate() { + name_to_index.insert(name.clone(), i); + } + + // Prepare coverage vectors + let mut coverage: Vec> = sequences + .iter() + .map(|(_, seq)| vec![0u32; seq.len()]) + .collect(); + + // Iterate through BAM files and accumulate coverage using pileup + for bam_path in bam_files { + let mut reader = bam::Reader::from_path(bam_path)?; + let header = reader.header().to_owned(); + let mut tid_to_index: HashMap = HashMap::new(); + for (tid, name_bytes) in header.target_names().iter().enumerate() { + if let Ok(name) = std::str::from_utf8(name_bytes) { + if let Some(idx) = name_to_index.get(name) { + tid_to_index.insert(tid as i32, *idx); + } + } + } + for p in reader.pileup() { + let pile = p?; + let tid = pile.tid(); + if let Some(&idx) = tid_to_index.get(&(tid as i32)) { + let pos = pile.pos() as usize; + if pos < coverage[idx].len() { + coverage[idx][pos] += pile.depth(); + } + } + } + } + + struct ContigData { + gc: Vec, + cov: Vec, + } + let mut contig_data: Vec = Vec::new(); + let mut fit_gc = Vec::new(); + let mut fit_rel_cov = Vec::new(); + + for (idx, (_name, seq)) in sequences.iter().enumerate() { + let cov_vec = &coverage[idx]; + let len = seq.len(); + let sum_cov: u64 = cov_vec.iter().map(|v| *v as u64).sum(); + let contig_mean = if len > 0 { + sum_cov as f64 / len as f64 + } else { + 0.0 + }; + let mut data = ContigData { + gc: Vec::new(), + cov: Vec::new(), + }; + let mut pos = 0usize; + while pos < len { + let end = std::cmp::min(pos + window_size, len); + let window_len = end - pos; + let gc_count = seq[pos..end] + .iter() + .filter(|b| matches!(**b, b'G' | b'g' | b'C' | b'c')) + .count(); + let gc_frac = gc_count as f64 / window_len as f64; + let cov_sum: u64 = cov_vec[pos..end].iter().map(|v| *v as u64).sum(); + let mean_cov = cov_sum as f64 / window_len as f64; + data.gc.push(gc_frac); + data.cov.push(mean_cov); + if contig_mean >= min_contig_cov && contig_mean > 0.0 { + fit_gc.push(gc_frac); + fit_rel_cov.push(mean_cov / contig_mean); + } + pos += window_size; + } + contig_data.push(data); + } + + if fit_gc.is_empty() { + return Err("No contigs with sufficient coverage for GC bias modelling".into()); + } + let (spline, points) = fit_gc_bias_spline(&fit_gc, &fit_rel_cov); + if let Some(p) = plot_path { + plot_spline(&points, &spline, p)?; + } + + let mut results = Vec::new(); + for ((name, _), data) in sequences.iter().zip(contig_data.iter()) { + if data.gc.is_empty() { + results.push((name.clone(), 0.0)); + continue; + } + let mut sum_adj = 0.0; + let mut count = 0usize; + for (g, c) in data.gc.iter().zip(data.cov.iter()) { + if let Some(pred) = spline.clamped_sample(*g) { + if pred > 0.0 { + sum_adj += c / pred; + count += 1; + } + } + } + let adj_mean = if count > 0 { + sum_adj / count as f64 + } else { + 0.0 + }; + results.push((name.clone(), adj_mean)); + } + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fit_gc_bias_spline_quadratic() { + let gc = vec![0.0, 0.5, 1.0]; + let rel: Vec = gc.iter().map(|g| 1.0 + (g - 0.5) * (g - 0.5)).collect(); + let (spline, _pts) = fit_gc_bias_spline(&gc, &rel); + for g in &gc { + let pred = spline.clamped_sample(*g).unwrap(); + let true_val = 1.0 + (g - 0.5) * (g - 0.5); + assert!((pred - true_val).abs() < 0.1); + } + } + + #[test] + fn test_adjustment_restores_mean() { + let gc = vec![0.0, 0.5, 1.0]; + let cov: Vec = gc + .iter() + .map(|g| 10.0 * (1.0 + (g - 0.5) * (g - 0.5))) + .collect(); + let contig_mean = 10.0; + let rel: Vec = cov.iter().map(|c| c / contig_mean).collect(); + let (spline, _pts) = fit_gc_bias_spline(&gc, &rel); + let mut adj = Vec::new(); + for (g, c) in gc.iter().zip(cov.iter()) { + let pred = spline.clamped_sample(*g).unwrap(); + adj.push(c / pred); + } + let mean_adj: f64 = adj.iter().sum::() / adj.len() as f64; + assert!((mean_adj - 10.0).abs() < 0.1); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6197f88..ab7f39a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ pub mod coverage_printer; pub mod coverage_takers; pub mod external_command_checker; pub mod filter; +pub mod gc_bias; pub mod genome; pub mod genome_exclusion; pub mod genome_parsing; @@ -39,7 +40,9 @@ extern crate bird_tool_utils_man; extern crate csv; extern crate galah; extern crate needletail; +extern crate plotters; extern crate roff; +extern crate splines; extern crate version_compare; pub const CONCATENATED_FASTA_FILE_SEPARATOR: &str = "~";