Skip to content

Commit

Permalink
feat(derive): report by read group where appropriate and feasible
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Mar 23, 2024
1 parent 406a7b8 commit f83015f
Show file tree
Hide file tree
Showing 6 changed files with 359 additions and 89 deletions.
2 changes: 1 addition & 1 deletion src/derive/command/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pub struct DeriveEncodingArgs {

/// Main function for the `ngs derive encoding` subcommand.
pub fn derive(args: DeriveEncodingArgs) -> anyhow::Result<()> {
info!("Starting derive readlen subcommand.");
info!("Starting derive encoding subcommand.");

let file = std::fs::File::open(args.src);
let reader = file
Expand Down
6 changes: 3 additions & 3 deletions src/derive/command/endedness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,21 +141,21 @@ pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> {
counter.get().to_formatted_string(&Locale::en)
);

// (1.5) Validate the read group information.
// (2) Validate the read group information.
let rgs_in_header_not_records = validate_read_group_info(&found_rgs, &header.parsed);
for rg_id in rgs_in_header_not_records {
ordering_flags.insert(Arc::new(rg_id), OrderingFlagsCounts::new());
}

// (2) Derive the endedness based on the ordering flags gathered.
// (3) Derive the endedness based on the ordering flags gathered.
let result = compute::predict(
ordering_flags,
read_names,
paired_deviance,
args.round_reads_per_template,
);

// (3) Print the output to stdout as JSON (more support for different output
// (4) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
println!("{}", output);
Expand Down
35 changes: 23 additions & 12 deletions src/derive/command/instrument.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! Functionality relating to the `ngs derive instrument` subcommand itself.
use anyhow::bail;
use clap::Args;
use num_format::{Locale, ToFormattedString};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::sync::Arc;
use tracing::info;

use crate::derive::instrument::compute;
Expand All @@ -13,6 +13,7 @@ use crate::utils::args::NumberOfRecords;
use crate::utils::display::RecordCounter;
use crate::utils::formats::bam::ParsedBAMFile;
use crate::utils::formats::utils::IndexCheck;
use crate::utils::read_groups::{get_read_group, validate_read_group_info, ReadGroupPtr};

/// Clap arguments for the `ngs derive instrument` subcommand.
#[derive(Args)]
Expand All @@ -34,9 +35,10 @@ pub struct DeriveInstrumentArgs {
/// Main function for the `ngs derive instrument` subcommand.
pub fn derive(args: DeriveInstrumentArgs) -> anyhow::Result<()> {
let src = args.src;
let mut instrument_names = HashSet::new();
let mut flowcell_names = HashSet::new();
let mut instrument_names: HashMap<ReadGroupPtr, HashSet<String>> = HashMap::new();
let mut flowcell_names: HashMap<ReadGroupPtr, HashSet<String>> = HashMap::new();
let mut metrics = compute::RecordMetrics::default();
let mut found_rgs = HashSet::new();

info!("Starting derive instrument subcommand.");

Expand All @@ -49,24 +51,25 @@ pub fn derive(args: DeriveInstrumentArgs) -> anyhow::Result<()> {
let mut counter = RecordCounter::default();
for result in reader.records(&header.parsed) {
let record = result?;
let read_group = get_read_group(&record, Some(&mut found_rgs));

if let Some(read_name) = record.read_name() {
let name: &str = read_name.as_ref();

match name.parse::<IlluminaReadName>() {
Ok(read) => {
instrument_names.insert(read.instrument_name);
instrument_names
.entry(read_group.clone())
.or_default()
.insert(read.instrument_name);
metrics.found_instrument_name += 1;
if let Some(fc) = read.flowcell {
flowcell_names.insert(fc);
flowcell_names.entry(read_group).or_default().insert(fc);
metrics.found_flowcell_name += 1;
}
}
Err(_) => {
bail!(
"Could not parse Illumina-formatted query names for read: {}",
name
);
metrics.bad_read_name += 1;
}
}
} else {
Expand All @@ -85,12 +88,20 @@ pub fn derive(args: DeriveInstrumentArgs) -> anyhow::Result<()> {
);
metrics.total_records = counter.get();

// (2) Derive the predict instrument results based on these detected
// (2) Validate the read group information.
let rgs_in_header_not_records = validate_read_group_info(&found_rgs, &header.parsed);
for rg_id in rgs_in_header_not_records {
let rg_ptr = Arc::new(rg_id);
instrument_names.insert(rg_ptr.clone(), HashSet::new());
flowcell_names.insert(rg_ptr, HashSet::new());
}

// (3) Derive the instrument results based on the detected
// instrument names and flowcell names.
let mut result = compute::predict(instrument_names, flowcell_names);
result.records = metrics;

// (3) Print the output to stdout as JSON (more support for different output
// (4) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
println!("{}", output);
Expand Down
25 changes: 20 additions & 5 deletions src/derive/command/readlen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ use anyhow::Context;
use clap::Args;
use num_format::{Locale, ToFormattedString};
use std::collections::HashMap;
use std::collections::HashSet;
use std::path::PathBuf;
use std::sync::Arc;
use tracing::info;

use crate::derive::readlen::compute;
Expand All @@ -13,6 +15,7 @@ use crate::utils::args::NumberOfRecords;
use crate::utils::display::RecordCounter;
use crate::utils::formats::bam::ParsedBAMFile;
use crate::utils::formats::utils::IndexCheck;
use crate::utils::read_groups::{get_read_group, validate_read_group_info, ReadGroupPtr};

/// Clap arguments for the `ngs derive readlen` subcommand.
#[derive(Args)]
Expand Down Expand Up @@ -41,7 +44,8 @@ pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> {
let majority_vote_cutoff = cutoff_in_range(args.majority_vote_cutoff, 0.0..=1.0)
.with_context(|| "Majority vote cutoff is not within acceptable range")?;

let mut read_lengths = HashMap::new();
let mut read_lengths: HashMap<ReadGroupPtr, HashMap<usize, usize>> = HashMap::new();
let mut found_rgs = HashSet::new();

info!("Starting derive readlen subcommand.");

Expand All @@ -54,9 +58,14 @@ pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> {
let mut counter = RecordCounter::default();
for result in reader.records(&header.parsed) {
let record = result?;
let read_group = get_read_group(&record, Some(&mut found_rgs));
let len = record.sequence().len();

*read_lengths.entry(len).or_default() += 1;
*read_lengths
.entry(read_group)
.or_default()
.entry(len)
.or_default() += 1;

counter.inc();
if counter.time_to_break(&args.num_records) {
Expand All @@ -69,10 +78,16 @@ pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> {
counter.get().to_formatted_string(&Locale::en)
);

// (2) Derive the consensus read length based on the read lengths gathered.
let result = compute::predict(read_lengths, counter.get(), majority_vote_cutoff).unwrap();
// (2) Validate the read group information.
let rgs_in_header_not_records = validate_read_group_info(&found_rgs, &header.parsed);
for rg_id in rgs_in_header_not_records {
read_lengths.insert(Arc::new(rg_id), HashMap::new());
}

// (3) Derive the consensus read length based on the read lengths gathered.
let result = compute::predict(read_lengths, majority_vote_cutoff);

// (3) Print the output to stdout as JSON (more support for different output
// (4) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
println!("{}", output);
Expand Down
Loading

0 comments on commit f83015f

Please sign in to comment.