diff --git a/src/common.py b/src/common.py index 9be377b6..c0df0348 100644 --- a/src/common.py +++ b/src/common.py @@ -9,7 +9,6 @@ import os import re import subprocess -import threading import math from collections import defaultdict from enum import Enum @@ -17,17 +16,6 @@ logger = logging.getLogger('IsoQuant') -class AtomicIDDistributor(object): - def __init__(self): - self.value = 0 - self._lock = threading.Lock() - - def increment(self): - with self._lock: - self.value += 1 - return self.value - - class CigarEvent(Enum): match = 0 insertion = 1 @@ -48,6 +36,13 @@ def get_ins_del_match_events(cls): return {cls.match, cls.insertion, cls.deletion, cls.seq_match, cls.seq_mismatch} +class TranscriptNaming: + transcript_prefix = "transcript" + novel_gene_prefix = "novel_gene_" + nic_transcript_suffix = ".nic" + nnic_transcript_suffix = ".nnic" + + # key, value def get_first_best_from_sorted(sorted_list_of_pairs): if not sorted_list_of_pairs: diff --git a/src/dataset_processor.py b/src/dataset_processor.py index f14e5008..b22054b2 100644 --- a/src/dataset_processor.py +++ b/src/dataset_processor.py @@ -624,7 +624,7 @@ def resolve_multimappers(self, chr_ids, sample, multimapped_reads): for assignment_list in multimapped_reads.values(): if len(assignment_list) > 1: - multimap_resolver.resolve(assignment_list) + assignment_list = multimap_resolver.resolve(assignment_list) resolved_lists = defaultdict(list) for a in assignment_list: resolved_lists[a.chr_id].append(a) diff --git a/src/gene_info.py b/src/gene_info.py index 277f3c21..469bceae 100644 --- a/src/gene_info.py +++ b/src/gene_info.py @@ -16,11 +16,10 @@ equal_ranges, get_intron_strand, intervals_total_length, - is_subprofile, overlaps, - junctions_from_blocks, - AtomicIDDistributor + junctions_from_blocks ) +from .id_policy import SimpleIDDistributor logger = logging.getLogger('IsoQuant') @@ -121,7 +120,7 @@ def print_debug(self): # exon/intron info class FeatureInfo: - feature_id_counter = AtomicIDDistributor() + feature_id_counter = SimpleIDDistributor() def __init__(self, chr_id, start, end, strand, type, gene_ids): self.id = FeatureInfo.feature_id_counter.increment() self.chr_id = chr_id diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py index d9f71c86..1273ac18 100644 --- a/src/graph_based_model_construction.py +++ b/src/graph_based_model_construction.py @@ -11,6 +11,7 @@ from enum import unique, Enum from .common import ( + TranscriptNaming, cmp, get_exons, intersection_len, @@ -46,10 +47,6 @@ class StrandnessReportingLevel(Enum): class GraphBasedModelConstructor: - transcript_prefix = "transcript" - novel_gene_prefix = "novel_gene_" - nic_transcript_suffix = ".nic" - nnic_transcript_suffix = ".nnic" detected_known_isoforms = set() extended_transcript_ids = set() @@ -411,7 +408,7 @@ def construct_fl_isoforms(self): transcript_range = (path[0][1], path[-1][1]) novel_exons = get_exons(transcript_range, list(intron_path)) count = self.path_storage.paths[path] - new_transcript_id = self.transcript_prefix + str(self.get_transcript_id()) + new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id()) # logger.debug("uuu %s: %s" % (new_transcript_id, str(novel_exons))) reference_isoform = None @@ -482,17 +479,17 @@ def construct_fl_isoforms(self): transcript_gene = self.select_reference_gene(intron_path, transcript_range, transcript_strand) if transcript_gene is None: - transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id + + transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id + "_" + str(self.get_transcript_id())) elif transcript_strand == '.': transcript_strand = self.gene_info.gene_strands[transcript_gene] if all(intron in self.known_introns for intron in intron_path): transcript_type = TranscriptModelType.novel_in_catalog - id_suffix = self.nic_transcript_suffix + id_suffix = TranscriptNaming.nic_transcript_suffix else: transcript_type = TranscriptModelType.novel_not_in_catalog - id_suffix = self.nnic_transcript_suffix + id_suffix = TranscriptNaming.nnic_transcript_suffix new_model = TranscriptModel(self.gene_info.chr_id, transcript_strand, new_transcript_id + ".%s" % self.gene_info.chr_id + id_suffix, @@ -644,11 +641,11 @@ def generate_monoexon_from_clustered(self, clustered_reads, forward=True): strand = '+' if forward else '-' coordinates = (five_prime_pos, three_prime_pos) if forward else (three_prime_pos, five_prime_pos) - new_transcript_id = self.transcript_prefix + str(self.get_transcript_id()) - transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id + + new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id()) + transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id + "_" + str(self.get_transcript_id())) transcript_type = TranscriptModelType.novel_not_in_catalog - id_suffix = self.nnic_transcript_suffix + id_suffix = TranscriptNaming.nnic_transcript_suffix is_valid = True half_len = interval_len(coordinates) / 2 diff --git a/src/id_policy.py b/src/id_policy.py index 029da66b..9c725830 100644 --- a/src/id_policy.py +++ b/src/id_policy.py @@ -3,9 +3,9 @@ # # All Rights Reserved # See file LICENSE for details. ############################################################################ +import threading - -from .graph_based_model_construction import GraphBasedModelConstructor +from src.common import TranscriptNaming class SimpleIDDistributor(object): @@ -25,7 +25,7 @@ def __init__(self, genedb, chr_id): return for g in genedb.region(seqid=chr_id, start=1, featuretype="gene"): - if g.id.startswith(GraphBasedModelConstructor.novel_gene_prefix): + if g.id.startswith(TranscriptNaming.novel_gene_prefix): try: gene_num = int(g.id.split("_")[-1]) self.forbidden_ids.add(gene_num) @@ -34,9 +34,9 @@ def __init__(self, genedb, chr_id): except ValueError: pass - transcript_num_start_pos = len(GraphBasedModelConstructor.transcript_prefix) + transcript_num_start_pos = len(TranscriptNaming.transcript_prefix) for t in genedb.region(seqid=chr_id, start=1, featuretype=("transcript", "mRNA")): - if t.id.startswith(GraphBasedModelConstructor.transcript_prefix): + if t.id.startswith(TranscriptNaming.transcript_prefix): try: transcript_num = int(t.id.split(".")[0][transcript_num_start_pos:]) self.forbidden_ids.add(transcript_num) @@ -78,3 +78,14 @@ def get_id(self, chr_id, feature, strand): feature_id = self.id_dict[feature_tuple] return feature_id + + +class AtomicIDDistributor(object): + def __init__(self): + self.value = 0 + self._lock = threading.Lock() + + def increment(self): + with self._lock: + self.value += 1 + return self.value diff --git a/src/isoform_assignment.py b/src/isoform_assignment.py index 1f6250f1..db1ba011 100644 --- a/src/isoform_assignment.py +++ b/src/isoform_assignment.py @@ -7,7 +7,8 @@ import logging from enum import Enum, unique -from src.common import AtomicIDDistributor, junctions_from_blocks +from src.common import junctions_from_blocks +from src.id_policy import SimpleIDDistributor from src.serialization import * from src.polya_finder import PolyAInfo @@ -619,7 +620,7 @@ def serialize(self, outfile): class ReadAssignment: - assignment_id_generator = AtomicIDDistributor() + assignment_id_generator = SimpleIDDistributor() def __init__(self, read_id, assignment_type, match=None): self.assignment_id = ReadAssignment.assignment_id_generator.increment() diff --git a/src/multimap_resolver.py b/src/multimap_resolver.py index e4dc2c54..47b06314 100644 --- a/src/multimap_resolver.py +++ b/src/multimap_resolver.py @@ -6,10 +6,12 @@ ############################################################################ import logging +import math from enum import Enum -from collections import defaultdict from .isoform_assignment import ReadAssignmentType +from .common import intersection_len + logger = logging.getLogger('IsoQuant') @@ -51,6 +53,7 @@ def select_best_assignment(self, assignment_list): consistent_assignments = [] inconsistent_assignments = [] primary_inconsistent = [] + noninformative = [] for i, a in enumerate(assignment_list): if a.assignment_type.is_inconsistent(): @@ -61,6 +64,8 @@ def select_best_assignment(self, assignment_list): consistent_assignments.append(i) if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous: primary_unique.append(i) + else: + noninformative.append(i) if primary_unique: return self.filter_assignments(assignment_list, primary_unique) @@ -74,7 +79,11 @@ def select_best_assignment(self, assignment_list): if inconsistent_assignments: return self.select_best_inconsistent(assignment_list, inconsistent_assignments) - return assignment_list + if noninformative: + return self.select_noninformative(assignment_list, noninformative) + + logger.warning("Unexpected assignments in multimap resolution %s" % assignment_list[0].read_id) + return assignment_list[0:1] @staticmethod def select_best_inconsistent(assignment_list, inconsistent_assignments): @@ -184,3 +193,27 @@ def find_duplicates(assignment_list, assignment_indices): % (example_assignment.read_id, example_assignment.chr_id, example_assignment.start)) return selected_assignments + + @staticmethod + def select_noninformative(assignment_list, assignment_indices): + # triplets (overlap_length, genomic_region_start, index) + overlap_index_list = [] + max_overlap_len = 0 + + for i in assignment_indices: + assignment = assignment_list[i] + overlap_len = intersection_len(assignment.genomic_region, (assignment.start, assignment.end)) + max_overlap_len = max(overlap_len, max_overlap_len) + overlap_index_list.append((overlap_len, assignment.genomic_region[0], i)) + + # select assignment with the best overlap with genic region and lowest region start (for reproducibility) + min_region_start = math.inf + best_assignment = -1 + for info in overlap_index_list: + if info[0] == max_overlap_len and info[1] < min_region_start: + min_region_start = info[1] + best_assignment = info[2] + + assert best_assignment != -1 + return MultimapResolver.filter_assignments(assignment_list, {best_assignment}) +