Merge branch 'main' of github.com:hiruna72/squigualiser into main
hiruna72 committed May 6, 2024
2 parents e689b46 + c005f54 commit 23f063f
Showing 9 changed files with 233 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -634,4 +634,4 @@ These examples were generated using the testcases - `1.1, 2.1, 1.11,` and `3.2`
Please refer to the example [pipelines](docs/pipeline_basic.md) to learn how to integrate squigualiser into your analysis.

## Acknowledgement
-Some code snippets have been taken from [blue-crab](https://github.com/Psy-Fer/blue-crab), [buttery-eel](https://github.com/Psy-Fer/buttery-eel), [readfish](https://github.com/LooseLab/readfish) and [bonito](https://github.com/nanoporetech/bonito)
+Some code snippets have been taken from [readpaf](https://pypi.org/project/readpaf/), [blue-crab](https://github.com/Psy-Fer/blue-crab), [buttery-eel](https://github.com/Psy-Fer/buttery-eel), [readfish](https://github.com/LooseLab/readfish) and [bonito](https://github.com/nanoporetech/bonito)
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,7 +1,6 @@
bokeh==3.1.1
numpy
pyslow5
-readpaf
pyfaidx
pyfastx
pysam
4 changes: 2 additions & 2 deletions src/calculate_offsets.py
@@ -1,6 +1,6 @@
import os
import argparse
-from readpaf import parse_paf
+from src import readpaf_local
import pysam
import pyslow5
from pyfaidx import Fasta
@@ -136,7 +136,7 @@ def calculate_offsets(args, sig_move_offset, output_pdf, s5):
    else:
        sequence_reads = Fastq(sequence_file)
    num_reads = 0
-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        read_id = paf_record.query_name
        if args.read_id != "" and args.read_id != read_id:
            continue
15 changes: 12 additions & 3 deletions src/metric.py
@@ -7,7 +7,7 @@
import pyslow5
import argparse
import re
-from readpaf import parse_paf
+from src import readpaf_local
from pyfaidx import Fasta
from pyfastx import Fastq
import os
@@ -292,7 +292,7 @@ def run(args):
        sequence_reads = Fasta(args.file)
    else:
        sequence_reads = Fastq(args.file)
-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        metric_record = {}
        if paf_record.query_name != paf_record.target_name:
            raise Exception("Error: this paf file is a signal to reference mapping. Please provide the argument --sig_ref ")
@@ -880,19 +880,28 @@ def run(args):
    else:
        raise Exception("Error: You should not have ended up here. Please check your arguments")

+    flag_discard = 0
    if args.output_current_column:
        for key, item in column_raw_samples.items():
            # print(', '.join(map(str, item)))
            array = np.array(item)

            median = np.median(array)
+            if median == 0:
+                print("median is zero!")
+                flag_discard = 1
+                break
            darray = array/median

            print(', '.join(map(str, darray)))

            # darray = plot_utils.scale_signal(array, 'medmad', {})
            # print(', '.join(map(str, darray)))

-    print("Number of records: {}".format(num_plots))
+    if flag_discard:
+        print("discard")
+    else:
+        print("Number of records: {}".format(num_plots))
    if num_plots == 0:
        print("Squigualiser only plots reads that span across the specified region entirely. Reduce the region interval and double check with IGV : {}".format(num_plots))

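The new guard matters because the per-column output divides every raw sample by the column median; a zero median would divide by zero and emit inf/nan values, so the whole record set is discarded instead. A minimal standalone sketch of the same behaviour, with illustrative names not taken from the source:

    import numpy as np

    def normalise_column(samples):
        # Scale a column of raw signal samples by its median.
        # Return None when the median is zero, mirroring the new
        # discard path above (dividing by zero would yield inf/nan).
        array = np.array(samples, dtype=float)
        median = np.median(array)
        if median == 0:
            return None
        return array / median

    print(normalise_column([80, 100, 120]))  # [0.8 1.  1.2]
    print(normalise_column([-5, 0, 5]))      # None: median is zero
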
4 changes: 2 additions & 2 deletions src/plot.py
@@ -14,7 +14,6 @@
import argparse
import re
import logging
-from readpaf import parse_paf
from sys import stdin
from pyfaidx import Fasta
from pyfastx import Fastq
@@ -24,6 +23,7 @@
from src import bed_annotation
from src import plot_utils
from src import calculate_offsets
+from src import readpaf_local

# ref_start is always 1based closed
# ref_end is always 1based closed
@@ -669,7 +669,7 @@ def run(args):
    args_ref_start = None
    args_ref_end = None

-    for paf_record in parse_paf(handle):
+    for paf_record in readpaf_local.parse_paf(handle):
        if args.auto:
            kmer_correction = 0
            draw_data["base_shift"] = 0
1 change: 0 additions & 1 deletion src/plot_pileup.py
@@ -15,7 +15,6 @@
import argparse
import re
import logging
-from readpaf import parse_paf
from sys import stdin
from pyfaidx import Fasta
from pyfastx import Fastq
214 changes: 214 additions & 0 deletions src/readpaf_local.py
@@ -0,0 +1,214 @@
# We acknowledge the authors of the readpaf library; it is vendored here as a local copy because readpaf is not available as a conda package.
from __future__ import division
from collections import namedtuple


__all__ = ["parse_paf"]

__version__ = "0.0.11a2"

try:
    import pandas as pd
except Exception as E:
    pandas = False
    e = E
else:
    pandas = True


class _PAF:
    """Base PAF methods, can't guarantee field names here so use indices"""

    def __str__(self):
        """Formats a record as a PAF line for writing to a file"""
        return "{}\t{}".format("\t".join(map(str, self[:-1])), self._fmt_tags())

    def _fmt_tags(self):
        """Format tag dict as SAM style"""
        return "\t".join("{}:{}:{}".format(*t) for t in self[-1].values())

    def blast_identity(self):
        """BLAST identity, see:
        https://lh3.github.io/2018/11/25/on-the-definition-of-sequence-identity
        """
        return self[9] / self[10]


SAM_TAG = namedtuple("tag", ["name", "type", "value"])
FIELDS = [
    "query_name",
    "query_length",
    "query_start",
    "query_end",
    "strand",
    "target_name",
    "target_length",
    "target_start",
    "target_end",
    "residue_matches",
    "alignment_block_length",
    "mapping_quality",
    "tags",
]
NA_VALUES = ["*"]
SAM_TYPES = {"i": int, "A": str, "f": float, "Z": str}


def _expand_dict_in_series(df, field):
    """Convert a Series of dict to Series and add to the original DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame with a Series of dict
    field : str
        The Series of dicts to expand

    Returns
    -------
    pd.DataFrame
        The original DataFrame with extra Series from the dicts
    """
    return df.join(
        pd.DataFrame(
            [{k: v for k, _, v in r.values()} for r in df.pop(field).tolist()]
        ),
        rsuffix="_tag",
    )


def _parse_tags(tags):
    """Convert a list of SAM style tags, from a PAF file, to a dict

    https://samtools.github.io/hts-specs/SAMv1.pdf section 1.5

    Parameters
    ----------
    tags : list
        A list of SAM style tags

    Returns
    -------
    dict of str: namedtuple
        Returns dict of SAM style tags.
        Each key is the tag name and the value is a namedtuple with fields
        `name`, `type`, and `value`.
    """
    return {
        tag: SAM_TAG(tag, type_, SAM_TYPES.get(type_, lambda x: x)(val))
        for tag, type_, val in (x.split(":", 2) for x in tags)
    }


def _paf_generator(file_like, fields=None, na_values=None, na_rep=None):
    """Generator that returns namedtuples from a PAF file

    Parameters
    ----------
    file_like : file-like object
        File-like object
    fields : list
        List of field names to use for records, must have 13 entries.

    Yields
    ------
    namedtuple
        Correctly formatted PAF record and a dict of extra tags

    Raises
    ------
    ValueError
    """
    if len(fields) != 13:
        raise ValueError("{} fields provided, expected 13".format(len(fields)))
    _PAF_nt = namedtuple("PAF", fields)
    PAF = type("PAF", (_PAF, _PAF_nt), dict())
    for record in file_like:
        record = record.strip()
        if not record:
            continue
        record = record.split("\t")
        yield PAF(
            str(record[0]),
            int(record[1]) if record[1] not in na_values else na_rep,
            int(record[2]) if record[2] not in na_values else na_rep,
            int(record[3]) if record[3] not in na_values else na_rep,
            str(record[4]),
            str(record[5]),
            int(record[6]) if record[6] not in na_values else na_rep,
            int(record[7]) if record[7] not in na_values else na_rep,
            int(record[8]) if record[8] not in na_values else na_rep,
            int(record[9]) if record[9] not in na_values else na_rep,
            int(record[10]) if record[10] not in na_values else na_rep,
            int(record[11]) if record[11] not in na_values else na_rep,
            _parse_tags(record[12:]),
        )


def parse_paf(file_like, fields=None, na_values=None, na_rep=0, dataframe=False):
    """Read a minimap2 PAF file as either an iterator or a pandas.DataFrame

    When using as an iterator the `tags` field is a dict of namedtuples.
    Each namedtuple has the fields `name`, `type`, `value` that correspond to
    each field (delimited by `:`) in the SAM-style tag.

    Parameters
    ----------
    file_like : file-like object
        Object with a read() method, such as sys.stdin, a file handle or io.StringIO.
    fields : list, optional
        List of field names to use for records, must have 13 entries. These should
        be in the order of the fields in the PAF file and the last field will be
        used for tags. Default:
        ["query_name", "query_length", "query_start", "query_end", "strand",
        "target_name", "target_length", "target_start", "target_end",
        "residue_matches", "alignment_block_length", "mapping_quality", "tags"]
    na_values : list[str], optional
        List of additional strings to interpret as NaN values in numeric fields
        (2, 3, 4, 7, 8, 9, 10, 11, 12).
        Default: ["*"]
    na_rep : int or float, optional
        Value to use when a NaN value specified in `na_values` is found. Default: `0`.
    dataframe : bool, optional
        Default is False. When True a pandas.DataFrame is returned with Series
        named as the `fields` parameter. SAM tags are expanded into Series as
        well and given their specified types; if any of the field names overlap
        with tags the tag column will be given the suffix `_tag`.

    Returns
    -------
    iterator or pandas.DataFrame when dataframe is True
    """
    fields = FIELDS if fields is None else fields
    na_values = set(NA_VALUES if na_values is None else na_values + NA_VALUES)
    if not isinstance(na_rep, (int, float)):
        raise ValueError("na_rep must be int or float")

    if dataframe and pandas:
        # TODO: make this nicer
        df = pd.DataFrame(
            (line.strip().split("\t", 12) for line in file_like if line.strip()),
            columns=fields,
        )
        df = df.join(
            pd.DataFrame(
                df.pop(fields[-1])
                .str.findall(r"([^\t]+?):[A-Za-z]+?:(.+?)")
                .map(dict)
                .to_list()
            ),
            rsuffix="_tag",
        )
        if df.empty:
            return pd.DataFrame(columns=fields)
        df = df.replace(
            {
                fields[i]: {v: na_rep for v in na_values}
                for i in (2, 3, 4, 7, 8, 9, 10, 11, 12)
            }
        )
        return df.infer_objects()
    elif dataframe and not pandas:
        raise ImportError(e)
    else:
        return _paf_generator(file_like, fields, na_values, na_rep)
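
Call sites elsewhere in this commit switch from "from readpaf import parse_paf" to "from src import readpaf_local" and invoke readpaf_local.parse_paf(handle). A minimal sketch of the iterator interface, assuming it is run from the repository root so that src is importable; the PAF line is illustrative, not from the source:

    import io
    from src import readpaf_local

    # One PAF record: 12 mandatory fields followed by SAM-style tags.
    paf_line = (
        "read1\t1000\t0\t1000\t+\tchr1\t248956422\t10000\t11000\t"
        "950\t1000\t60\tNM:i:50\n"
    )

    for record in readpaf_local.parse_paf(io.StringIO(paf_line)):
        print(record.query_name)        # "read1"
        print(record.target_start)      # 10000
        print(record.tags["NM"].value)  # 50, cast to int via SAM_TYPES

Passing dataframe=True instead returns a pandas.DataFrame with the tags expanded into extra columns, and raises ImportError if pandas is not installed.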
4 changes: 2 additions & 2 deletions src/realign.py
@@ -1,6 +1,6 @@
import os
import argparse
-from readpaf import parse_paf
+from src import readpaf_local
import pysam

BAM_CMATCH, BAM_CINS, BAM_CDEL, BAM_CREF_SKIP, BAM_CSOFT_CLIP, BAM_CHARD_CLIP, BAM_CPAD, BAM_CEQUAL, BAM_CDIFF, BAM_CBACK = range(10)
@@ -30,7 +30,7 @@ def run(args):
    # inefficient
    paf_file = open(args.paf, "r")
    paf_dic = {}
-    for record in parse_paf(paf_file):
+    for record in readpaf_local.parse_paf(paf_file):
        paf_dic[record.query_name] = record

    processed_sam_record_count = 0
1 change: 0 additions & 1 deletion src/reform.py
@@ -1,6 +1,5 @@
import os
import argparse
-from readpaf import parse_paf
import pysam

DEFAULT_KMER_SIZE = 9
