Merge branch 'main' of github.com:hiruna72/squigualiser into main
hiruna72 committed May 6, 2024
2 parents e689b46 + c005f54 commit 23f063f
Showing 9 changed files with 233 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -634,4 +634,4 @@ These examples were generated using the testcases - `1.1, 2.1, 1.11,` and `3.2`
Please refer to the example [pipelines](docs/pipeline_basic.md) to learn how to integrate squigualiser into your analysis.

## Acknowledgement
-Some code snippets have been taken from [blue-crab](https://github.com/Psy-Fer/blue-crab), [buttery-eel](https://github.com/Psy-Fer/buttery-eel), [readfish](https://github.com/LooseLab/readfish) and [bonito](https://github.com/nanoporetech/bonito)
+Some code snippets have been taken from [readpaf](https://pypi.org/project/readpaf/), [blue-crab](https://github.com/Psy-Fer/blue-crab), [buttery-eel](https://github.com/Psy-Fer/buttery-eel), [readfish](https://github.com/LooseLab/readfish) and [bonito](https://github.com/nanoporetech/bonito)
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,7 +1,6 @@
bokeh==3.1.1
numpy
pyslow5
-readpaf
pyfaidx
pyfastx
pysam
4 changes: 2 additions & 2 deletions src/calculate_offsets.py
@@ -1,6 +1,6 @@
import os
import argparse
-from readpaf import parse_paf
+from src import readpaf_local
import pysam
import pyslow5
from pyfaidx import Fasta
@@ -136,7 +136,7 @@ def calculate_offsets(args, sig_move_offset, output_pdf, s5):
    else:
        sequence_reads = Fastq(sequence_file)
    num_reads = 0
-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        read_id = paf_record.query_name
        if args.read_id != "" and args.read_id != read_id:
            continue
15 changes: 12 additions & 3 deletions src/metric.py
@@ -7,7 +7,7 @@
import pyslow5
import argparse
import re
-from readpaf import parse_paf
+from src import readpaf_local
from pyfaidx import Fasta
from pyfastx import Fastq
import os
@@ -292,7 +292,7 @@ def run(args):
        sequence_reads = Fasta(args.file)
    else:
        sequence_reads = Fastq(args.file)
-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        metric_record = {}
        if paf_record.query_name != paf_record.target_name:
            raise Exception("Error: this paf file is a signal to reference mapping. Please provide the argument --sig_ref ")
@@ -880,19 +880,28 @@ def run(args):
    else:
        raise Exception("Error: You should not have ended up here. Please check your arguments")

+    flag_discard = 0
    if args.output_current_column:
        for key, item in column_raw_samples.items():
            # print(', '.join(map(str, item)))
            array = np.array(item)

            median = np.median(array)
+            if median == 0:
+                print("median is zero!")
+                flag_discard = 1
+                break
            darray = array/median

            print(', '.join(map(str, darray)))

            # darray = plot_utils.scale_signal(array, 'medmad', {})
            # print(', '.join(map(str, darray)))

-    print("Number of records: {}".format(num_plots))
+    if flag_discard:
+        print("discard")
+    else:
+        print("Number of records: {}".format(num_plots))
    if num_plots == 0:
        print("Squigualiser only plots reads that span across the specified region entirely. Reduce the region interval and double check with IGV : {}".format(num_plots))

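The new guard matters because the per-column output divides every raw sample by the column median; a zero median would divide by zero and emit inf/nan values, so the whole record set is discarded instead. A minimal standalone sketch of the same behaviour, with illustrative names not taken from the source:

    import numpy as np

    def normalise_column(samples):
        # Scale a column of raw signal samples by its median.
        # Return None when the median is zero, mirroring the new
        # discard path above (dividing by zero would yield inf/nan).
        array = np.array(samples, dtype=float)
        median = np.median(array)
        if median == 0:
            return None
        return array / median

    print(normalise_column([80, 100, 120]))  # [0.8 1.  1.2]
    print(normalise_column([-5, 0, 5]))      # None: median is zero
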
4 changes: 2 additions & 2 deletions src/plot.py
@@ -14,7 +14,6 @@
import argparse
import re
import logging
-from readpaf import parse_paf
from sys import stdin
from pyfaidx import Fasta
from pyfastx import Fastq
@@ -24,6 +23,7 @@
from src import bed_annotation
from src import plot_utils
from src import calculate_offsets
+from src import readpaf_local

# ref_start is always 1based closed
# ref_end is always 1based closed
@@ -669,7 +669,7 @@ def run(args):
    args_ref_start = None
    args_ref_end = None

-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        if args.auto:
            kmer_correction = 0
            draw_data["base_shift"] = 0
1 change: 0 additions & 1 deletion src/plot_pileup.py
@@ -15,7 +15,6 @@
import argparse
import re
import logging
-from readpaf import parse_paf
from sys import stdin
from pyfaidx import Fasta
from pyfastx import Fastq
214 changes: 214 additions & 0 deletions src/readpaf_local.py
@@ -0,0 +1,214 @@
# We acknowledge the authors of the readpaf library; it is vendored here as a local copy because readpaf is not available as a conda package.
from __future__ import division
from collections import namedtuple


__all__ = ["parse_paf"]

__version__ = "0.0.11a2"

try:
    import pandas as pd
except Exception as E:
    pandas = False
    e = E
else:
    pandas = True


class _PAF:
    """Base PAF methods, can't guarantee field names here so use indices"""

    def __str__(self):
        """Formats a record as a PAF line for writing to a file"""
        return "{}\t{}".format("\t".join(map(str, self[:-1])), self._fmt_tags())

    def _fmt_tags(self):
        """Format tag dict as SAM style"""
        return "\t".join("{}:{}:{}".format(*t) for t in self[-1].values())

    def blast_identity(self):
        """BLAST identity, see:
        https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity
        """
        return self[9] / self[10]


SAM_TAG = namedtuple("tag", ["name", "type", "value"])
FIELDS = [
    "query_name",
    "query_length",
    "query_start",
    "query_end",
    "strand",
    "target_name",
    "target_length",
    "target_start",
    "target_end",
    "residue_matches",
    "alignment_block_length",
    "mapping_quality",
    "tags",
]
NA_VALUES = ["*"]
SAM_TYPES = {"i": int, "A": str, "f": float, "Z": str}


def _expand_dict_in_series(df, field):
    """Convert a Series of dict to Series and add to the original DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame with a Series of dict
    field : str
        The Series of dicts to expand

    Returns
    -------
    pd.DataFrame
        The original DataFrame with extra Series from the dicts
    """
    return df.join(
        pd.DataFrame(
            [{k: v for k, _, v in r.values()} for r in df.pop(field).tolist()]
        ),
        rsuffix="_tag",
    )


def _parse_tags(tags):
    """Convert a list of SAM style tags, from a PAF file, to a dict

    https://samtools.github.io/hts-specs/SAMv1.pdf section 1.5

    Parameters
    ----------
    tags : list
        A list of SAM style tags

    Returns
    -------
    dict of str: namedtuple
        Returns dict of SAM style tags.
        Each key is the tag name and the value is a namedtuple with fields
        `name`, `type`, and `value`.
    """
    return {
        tag: SAM_TAG(tag, type_, SAM_TYPES.get(type_, lambda x: x)(val))
        for tag, type_, val in (x.split(":", 2) for x in tags)
    }


def _paf_generator(file_like, fields=None, na_values=None, na_rep=None):
    """Generator that returns namedtuples from a PAF file

    Parameters
    ----------
    file_like : file-like object
        File-like object
    fields : list
        List of field names to use for records, must have 13 entries.

    Yields
    ------
    namedtuple
        Correctly formatted PAF record and a dict of extra tags

    Raises
    ------
    ValueError
    """
    if len(fields) != 13:
        raise ValueError("{} fields provided, expected 13".format(len(fields)))
    _PAF_nt = namedtuple("PAF", fields)
    PAF = type("PAF", (_PAF, _PAF_nt), dict())
    for record in file_like:
        record = record.strip()
        if not record:
            continue
        record = record.split("\t")
        yield PAF(
            str(record[0]),
            int(record[1]) if record[1] not in na_values else na_rep,
            int(record[2]) if record[2] not in na_values else na_rep,
            int(record[3]) if record[3] not in na_values else na_rep,
            str(record[4]),
            str(record[5]),
            int(record[6]) if record[6] not in na_values else na_rep,
            int(record[7]) if record[7] not in na_values else na_rep,
            int(record[8]) if record[8] not in na_values else na_rep,
            int(record[9]) if record[9] not in na_values else na_rep,
            int(record[10]) if record[10] not in na_values else na_rep,
            int(record[11]) if record[11] not in na_values else na_rep,
            _parse_tags(record[12:]),
        )


def parse_paf(file_like, fields=None, na_values=None, na_rep=0, dataframe=False):
    """Read a minimap2 PAF file as either an iterator or a pandas.DataFrame

    When using as an iterator the `tags` field is a dict of namedtuples.
    Each namedtuple has the fields `name`, `type`, `value` that correspond to
    each field (delimited by `:`) in the SAM-style tag.

    Parameters
    ----------
    file_like : file-like object
        Object with a read() method, such as sys.stdin, a file handle or io.StringIO.
    fields : list, optional
        List of field names to use for records, must have 13 entries. These should
        be in the order of the fields in the PAF file and the last field will be
        used for tags. Default:
        ["query_name", "query_length", "query_start", "query_end", "strand",
        "target_name", "target_length", "target_start", "target_end",
        "residue_matches", "alignment_block_length", "mapping_quality", "tags"]
    na_values : list[str], optional
        List of additional strings to interpret as NaN values in numeric fields
        (2, 3, 4, 7, 8, 9, 10, 11, 12).
        Default: ["*"]
    na_rep : int or float, optional
        Value to use when a NaN value specified in `na_values` is found. Default: `0`.
    dataframe : bool, optional
        Default is False. When True a pandas.DataFrame is returned with Series
        named as the `fields` parameter. SAM tags are expanded into Series as
        well and given their specified types; if any of the field names overlap
        with tags the tag column will be given the suffix `_tag`.

    Returns
    -------
    iterator or pandas.DataFrame when dataframe is True
    """
    fields = FIELDS if fields is None else fields
    na_values = set(NA_VALUES if na_values is None else na_values + NA_VALUES)
    if not isinstance(na_rep, (int, float)):
        raise ValueError("na_rep must be int or float")

    if dataframe and pandas:
        # TODO: make this nicer
        df = pd.DataFrame(
            (line.strip().split("\t", 12) for line in file_like if line.strip()),
            columns=fields,
        )
        df = df.join(
            pd.DataFrame(
                df.pop(fields[-1])
                .str.findall(r"([^\t]+?):[A-Za-z]+?:(.+?)")
                .map(dict)
                .to_list()
            ),
            rsuffix="_tag",
        )
        if df.empty:
            return pd.DataFrame(columns=fields)
        df = df.replace(
            {
                fields[i]: {v: na_rep for v in na_values}
                for i in (2, 3, 4, 7, 8, 9, 10, 11, 12)
            }
        )
        return df.infer_objects()
    elif dataframe and not pandas:
        raise ImportError(e)
    else:
        return _paf_generator(file_like, fields, na_values, na_rep)
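
Call sites elsewhere in this commit switch from "from readpaf import parse_paf" to "from src import readpaf_local" and invoke readpaf_local.parse_paf(handle). A minimal sketch of the iterator interface, assuming it is run from the repository root so that src is importable; the PAF line is illustrative, not from the source:

    import io
    from src import readpaf_local

    # One PAF record: 12 mandatory fields followed by SAM-style tags.
    paf_line = (
        "read1\t1000\t0\t1000\t+\tchr1\t248956422\t10000\t11000\t"
        "950\t1000\t60\tNM:i:50\n"
    )

    for record in readpaf_local.parse_paf(io.StringIO(paf_line)):
        print(record.query_name)        # "read1"
        print(record.target_start)      # 10000
        print(record.tags["NM"].value)  # 50, cast to int via SAM_TYPES

Passing dataframe=True instead returns a pandas.DataFrame with the tags expanded into extra columns, and raises ImportError if pandas is not installed.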
4 changes: 2 additions & 2 deletions src/realign.py
@@ -1,6 +1,6 @@
import os
import argparse
-from readpaf import parse_paf
+from src import readpaf_local
import pysam

BAM_CMATCH, BAM_CINS, BAM_CDEL, BAM_CREF_SKIP, BAM_CSOFT_CLIP, BAM_CHARD_CLIP, BAM_CPAD, BAM_CEQUAL, BAM_CDIFF, BAM_CBACK = range(10)
@@ -30,7 +30,7 @@ def run(args):
    # inefficient
    paf_file = open(args.paf, "r")
    paf_dic = {}
-    for record in parse_paf(paf_file):
+    for record in readpaf_local.parse_paf(paf_file):
        paf_dic[record.query_name] = record

    processed_sam_record_count = 0
1 change: 0 additions & 1 deletion src/reform.py
@@ -1,6 +1,5 @@
import os
import argparse
-from readpaf import parse_paf
import pysam

DEFAULT_KMER_SIZE = 9
