From a664420a57107b296ab78566bf98253f8272f120 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20G=C3=B6ke?=
 <10560051+jonathangoeke@users.noreply.github.com>
Date: Wed, 10 Mar 2021 09:28:29 +0800
Subject: [PATCH 1/3] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index db39c0d..559c042 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ $ pyensembl install --release 91 --species homo_sapiens  # please specify the co
 ```
 ### Documentation
 
-Please refer to the xPore documention ([https://xpore.readthedocs.io](https://xpore.readthedocs.io)) for additional information, a quick start guide, and details on the data processing and output file format.
+Please refer to the xPore documentation ([https://xpore.readthedocs.io](https://xpore.readthedocs.io)) for additional information, a quick start guide, and details on the data processing and output file format.
 
 xPore is described in detail in a preprint (https://www.biorxiv.org/content/10.1101/2020.06.18.160010v1)
 

From 93355d2bfdfc8b5a15fdbad394cfc7f554525174 Mon Sep 17 00:00:00 2001
From: obenno <obennoname@gmail.com>
Date: Tue, 24 Aug 2021 19:09:28 +0800
Subject: [PATCH 2/3] Function readGTF was modified to adapt to non-ensembl GTF
 file input

---
 xpore/scripts/dataprep.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py
index b713f4b..beb9d70 100644
--- a/xpore/scripts/dataprep.py
+++ b/xpore/scripts/dataprep.py
@@ -178,11 +178,19 @@ def readGTF(gtf_path_or_url):
     dict={}
     for ln in gtf:
         if not ln.startswith("#"):
-            ln=ln.split("\t")
+            ln=ln.strip("\n").split("\t")
             if ln[2] == "transcript" or ln[2] == "exon":
                 chr,type,start,end=ln[0],ln[2],int(ln[3]),int(ln[4])
-                tx_id=ln[-1].split('; transcript_id "')[1].split('";')[0]
-                g_id=ln[-1].split('gene_id "')[1].split('";')[0]
+                attrList=ln[-1].split(";")
+                attrDict={}
+                for k in attrList:
+                    p=k.strip().split(" ")
+                    if len(p) == 2:
+                        attrDict[p[0]]=p[1].strip('\"')
+                ## Look up the IDs by attribute key instead of splitting on fixed text such as
+                ## '; transcript_id "', which only matched one particular attribute layout.
+                tx_id = attrDict["transcript_id"]
+                g_id = attrDict["gene_id"]
                 if tx_id not in dict:
                     dict[tx_id]={'chr':chr,'g_id':g_id,'strand':ln[6]}
                     if type not in dict[tx_id]:

From 0976da5b2cb296b444141a7ce3e831c3ab4aaf10 Mon Sep 17 00:00:00 2001
From: Yuk Kei Wan <41866052+yuukiiwa@users.noreply.github.com>
Date: Thu, 16 Sep 2021 12:21:46 +0800
Subject: [PATCH 3/3] add assert "NNNNN"

---
 xpore/scripts/dataprep.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py
index beb9d70..ebc304e 100644
--- a/xpore/scripts/dataprep.py
+++ b/xpore/scripts/dataprep.py
@@ -578,6 +578,7 @@ def preprocess_tx(tx_id,data_dict,out_paths,locks):
             
         try:
             assert len(set(reference_kmer_array)) == 1
+            assert list(set(reference_kmer_array))[0].count('N') == 0 ## weed out mapped k-mers from tx_seq that contain 'N'; such k-mers are not in diffmod's model_kmer
         except:
             asserted = False
             break