Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Go site 2246 gorule 0000001 rnac rna types are getting mangled #676

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ontobio/io/assocparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ class Report(object):
VIOLATES_GO_RULE = "Violates GO Rule"
RULE_PASS = "Passes GO Rule"
INVALID_REFERENCES = "Only one reference per ID space allowed"
INVALID_SUBJECT_TYPE = "Invalid subject type"

def __init__(self, group="unknown", dataset="unknown", config=None):
self.messages = []
Expand Down
10 changes: 10 additions & 0 deletions ontobio/io/gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,9 @@ def to_association(gaf_line: List[str], report=None, group="unknown", dataset="u
DB_OBJECT_SYMBOL = 2
TAXON_INDEX = 12
REFERENCE_INDEX = 5
DEFAULT_SUBJECT_TYPE = 'gene_product'
STR_DEFAULT_SUBJECT_TYPE_CURIE = str(association.map_gp_type_label_to_curie(DEFAULT_SUBJECT_TYPE))

if gaf_line[DB_INDEX] == "":
report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1)
return assocparser.ParseResult(source_line, [], True, report=report)
Expand Down Expand Up @@ -417,7 +420,14 @@ def to_association(gaf_line: List[str], report=None, group="unknown", dataset="u

interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None
subject_curie = association.Curie(gaf_line[0], gaf_line[1])
type_label = gaf_line[11]
subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon)
# Output a warning if the system is defaulting to gene_product
if DEFAULT_SUBJECT_TYPE != type_label and len(subject.type) == 1 and STR_DEFAULT_SUBJECT_TYPE_CURIE == str(subject.type[0]):
report.warning(source_line, Report.INVALID_SUBJECT_TYPE, type_label, "defaulting to 'gene_product'", taxon=gaf_line[TAXON_INDEX], rule=1)
if association.map_gp_type_label_to_repair_curie(type_label) is not None:
report.warning(source_line, Report.INVALID_SUBJECT_TYPE, type_label, "has been repaired", taxon=gaf_line[TAXON_INDEX], rule=1)

gpi_entity = bio_entities.get(subject_curie)
if gpi_entity is not None and subject != gpi_entity:
subject = gpi_entity
Expand Down
25 changes: 23 additions & 2 deletions ontobio/model/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,24 @@ def fullname_field(self, max=None) -> str:

# ===============================================================================
__default_entity_type_to_curie_mapping = bidict.bidict({
"autocatalytically_spliced_intron": Curie.from_str("SO:0000588"),
"protein_coding_gene": Curie.from_str("SO:0001217"),
"snRNA": Curie.from_str("SO:0000274"),
"ncRNA": Curie.from_str("SO:0000655"),
"rRNA": Curie.from_str("SO:0000252"),
"mRNA": Curie.from_str("SO:0000234"),
"lnc_RNA": Curie.from_str("SO:0001877"),
"lincRNA": Curie.from_str("SO:0001463"),
"lncRNA": Curie.from_str("SO:0001877"),
"tRNA": Curie.from_str("SO:0000253"),
"snoRNA": Curie.from_str("SO:0000275"),
"miRNA": Curie.from_str("SO:0000276"),
"RNA": Curie.from_str("SO:0000356"),
"scRNA": Curie.from_str("SO:0000013"),
"piRNA": Curie.from_str("SO:0001035"),
"pre_miRNA": Curie.from_str("SO:0001244"),
"tmRNA": Curie.from_str("SO:0000584"),
"scaRNA": Curie.from_str("SO:0002095"),
"siRNA": Curie.from_str("SO:0000646"),
"SRP_RNA": Curie.from_str("SO:0000590"),
"primary_transcript": Curie.from_str("SO:0000185"),
"ribozyme": Curie.from_str("SO:0000374"),
Expand All @@ -234,12 +238,14 @@ def fullname_field(self, max=None) -> str:
"hammerhead_ribozyme": Curie.from_str("SO:0000380"),
"protein": Curie.from_str("PR:000000001"),
"pseudogene": Curie.from_str("SO:0000336"),
"pseudogenic_transcript": Curie.from_str("SO:0000516"),
"gene": Curie.from_str("SO:0000704"),
"biological region": Curie.from_str("SO:0001411"),
"protein_complex": Curie.from_str("GO:0032991"),
"transcript": Curie.from_str("SO:0000673"),
"gene_product": Curie.from_str("CHEBI:33695"),
"antisense_lncRNA": Curie.from_str("SO:0001904"),
"antisense_lncRNA_gene": Curie.from_str("SO:0002182"),
"transposable_element_gene": Curie.from_str("SO:0000111"),
"gene_segment": Curie.from_str("SO:3000000"),
"genetic_marker": Curie.from_str("SO:0001645"),
Expand All @@ -257,7 +263,14 @@ def fullname_field(self, max=None) -> str:
"snRNA_gene": Curie.from_str("SO:0001268"),
"SRP_RNA_gene": Curie.from_str("SO:0001269"),
"telomerase_RNA_gene": Curie.from_str("SO:0001643"),
"tRNA_gene": Curie.from_str("SO:0001272")
"tRNA_gene": Curie.from_str("SO:0001272"),
"vault_RNA": Curie.from_str("SO:0000404"),
"Y_RNA": Curie.from_str("SO:0000405")
})

# ===============================================================================
# Labels that are not entries of the default mapping but can still be resolved
# ("repaired") to a Curie. map_gp_type_label_to_curie consults this mapping only
# when the label is absent from the default mapping, and
# map_gp_type_label_to_repair_curie looks labels up here directly.
# "lnc_RNA" resolves to the same Curie as the default label "lncRNA".
__repair_entity_type_to_curie_mapping = bidict.bidict({
"lnc_RNA": Curie.from_str("SO:0001877")
})

def map_gp_type_label_to_curie(type_label: str) -> Curie:
Expand All @@ -267,9 +280,13 @@ def map_gp_type_label_to_curie(type_label: str) -> Curie:

This is a measure to upgrade the pseudo-labels into proper Curies. Present here are
the existing set of labels in current use, and how they should be mapped into CURIEs.
Repair Sequence Ontology (SO) labels if possible
"""
# normalized_label = type_label.translate()
global __default_entity_type_to_curie_mapping
global __repair_entity_type_to_curie_mapping
if type_label not in __default_entity_type_to_curie_mapping and type_label in __repair_entity_type_to_curie_mapping:
return __repair_entity_type_to_curie_mapping.get(type_label)
return __default_entity_type_to_curie_mapping.get(type_label, __default_entity_type_to_curie_mapping["gene_product"])

def gp_type_label_to_curie(type: Curie) -> str:
Expand All @@ -279,6 +296,10 @@ def gp_type_label_to_curie(type: Curie) -> str:
global __default_entity_type_to_curie_mapping
return __default_entity_type_to_curie_mapping.inverse.get(type, "gene_product")

def map_gp_type_label_to_repair_curie(type_label: str) -> Curie:
    """
    Look up `type_label` in the repair mapping only.

    Returns the Curie registered for the label in the repair mapping, or None
    when the label has no entry there. The default mapping is not consulted.
    """
    # Read-only access to the module-level mapping; no `global` declaration is
    # needed because the name is never rebound here.
    return __repair_entity_type_to_curie_mapping.get(type_label)

@dataclass(unsafe_hash=True)
class Term:
"""
Expand Down
29 changes: 29 additions & 0 deletions tests/test_gafparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,36 @@ def test_obsolete_replair_of_withfrom():
assoc_result = p.parse_line(obsolete_no_replacement_line)
assert assoc_result.associations == []
assert p.report.to_report_json()["messages"]["gorule-0000020"][0]["obj"] == "GO:0016458"


def test_invalid_db_type():
    """
    Check how the subject type label in a GAF line (index 11) ends up as the
    subject type Curie of the parsed association.

    Four scenarios are exercised, in order:
      * 'gene_product' stays 'gene_product'
      * 'protein' stays 'protein'
      * an unhandled label falls back to 'gene_product' and is reported
      * 'lnc_RNA' is repaired to 'lncRNA' and is reported
    """
    def parse_with_type(type_label):
        # Same GAF line every time; only the subject type column varies.
        gaf_fields = ["UniProtKB", "P0AFI2", "parC", "", "GO:0003916", "PMID:1334483", "IDA", "", "F", "", "", type_label, "taxon:83333", "20081208", "EcoliWiki"]
        return gafparser.to_association(gaf_fields)

    # (label placed in the GAF line, label whose Curie is expected, report entry expected?)
    scenarios = [
        ("gene_product", "gene_product", False),
        ("protein", "protein", False),
        ("invalid_gene_product", "gene_product", True),
        ("lnc_RNA", "lncRNA", True),
    ]

    for given_label, expected_label, expect_report_entry in scenarios:
        result = parse_with_type(given_label)
        first_assoc = result.associations[0]
        assert first_assoc.subject.type == [association.map_gp_type_label_to_curie(expected_label)]

        if expect_report_entry:
            # The first message filed under gorule-0000001 must flag the subject type.
            first_message = result.report.to_report_json()["messages"]["gorule-0000001"][0]
            assert first_message["type"] == result.report.INVALID_SUBJECT_TYPE



def test_subject_extensions_bad_curie():
"""
Expand Down
Loading