From a664420a57107b296ab78566bf98253f8272f120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20G=C3=B6ke?= <10560051+jonathangoeke@users.noreply.github.com> Date: Wed, 10 Mar 2021 09:28:29 +0800 Subject: [PATCH 1/3] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db39c0d..559c042 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ $ pyensembl install --release 91 --species homo_sapiens # please specify the co ``` ### Documentation -Please refer to the xPore documention ([https://xpore.readthedocs.io](https://xpore.readthedocs.io)) for additional information, a quick start guide, and details on the data processing and output file format. +Please refer to the xPore documentation ([https://xpore.readthedocs.io](https://xpore.readthedocs.io)) for additional information, a quick start guide, and details on the data processing and output file format. xPore is described in detail in a preprint (https://www.biorxiv.org/content/10.1101/2020.06.18.160010v1) From 93355d2bfdfc8b5a15fdbad394cfc7f554525174 Mon Sep 17 00:00:00 2001 From: obenno Date: Tue, 24 Aug 2021 19:09:28 +0800 Subject: [PATCH 2/3] Function readGTF was modified to adapt to non-ensembl GTF file input --- xpore/scripts/dataprep.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py index b713f4b..beb9d70 100644 --- a/xpore/scripts/dataprep.py +++ b/xpore/scripts/dataprep.py @@ -178,11 +178,19 @@ def readGTF(gtf_path_or_url): dict={} for ln in gtf: if not ln.startswith("#"): - ln=ln.split("\t") + ln=ln.strip("\n").split("\t") if ln[2] == "transcript" or ln[2] == "exon": chr,type,start,end=ln[0],ln[2],int(ln[3]),int(ln[4]) - tx_id=ln[-1].split('; transcript_id "')[1].split('";')[0] - g_id=ln[-1].split('gene_id "')[1].split('";')[0] + attrList=ln[-1].split(";") + attrDict={} + for k in attrList: + p=k.strip().split(" ") + if len(p) == 2: + 
attrDict[p[0]]=p[1].strip('\"') + ##tx_id=ln[-1].split('; transcript_id "')[1].split('";')[0] + ##g_id=ln[-1].split('gene_id "')[1].split('";')[0] + tx_id = attrDict["transcript_id"] + g_id = attrDict["gene_id"] if tx_id not in dict: dict[tx_id]={'chr':chr,'g_id':g_id,'strand':ln[6]} if type not in dict[tx_id]: From 0976da5b2cb296b444141a7ce3e831c3ab4aaf10 Mon Sep 17 00:00:00 2001 From: Yuk Kei Wan <41866052+yuukiiwa@users.noreply.github.com> Date: Thu, 16 Sep 2021 12:21:46 +0800 Subject: [PATCH 3/3] add assert "NNNNN" --- xpore/scripts/dataprep.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py index beb9d70..ebc304e 100644 --- a/xpore/scripts/dataprep.py +++ b/xpore/scripts/dataprep.py @@ -578,6 +578,7 @@ def preprocess_tx(tx_id,data_dict,out_paths,locks): try: assert len(set(reference_kmer_array)) == 1 + assert list(set(reference_kmer_array))[0].count('N') == 0 ##to weed out the mapped kmers from tx_seq that contain 'N', since such kmers are not in diffmod's model_kmer except: asserted = False break