BUSCO_2_Chrom.py

#!/usr/bin/env python3
"""
Plot chromosomes colored by reference sets of orthologous loci
(BUSCO's single copy orthologs). These sets of orthologs can be groups
of orthologs found in the same chromosome in a reference species but
could also be orthologs inferred to be found in the same chromosome
for long evolutionary time.
The most frequent reference set per chromosome will be found in gray.
Reference sets found in more than the fractional cutoff will have a
different color while the ones found in less than x fraction will all
be represented with the same color.
Tags will indicate the reference set that the color represents. These
will be represented whenever there is a change in reference, but 
hidden when the neighboring loci is the same as the previously indicated.
The telomeric repeat is searched for at both ends of each sequence.
Sequences where the repeat is found at least 10 times contiguously
within the first or last 1000 nts are considered to be capped by telomeres.
Sequences with telomeres will have a round end while those lacking them
will have blunt ends.

Usage:
        BUSCO_2_Chrom.py --fasta <FILE> --busco <FILE> --ref <FILE> 
                          [--telomere STR] [--minLen INT] [--frac FLOAT]
                          [--width INT] [--height INT] [--title STR]
                          [--out FILE]

options:
    -t STR, --telomere STR  telomeric repeat. Varies between very distant taxa
                            [Default: TTAGG]
    -m INT, --minLen INT    minimum sequence length to include in plot
                            [Default: 2000000]
    -w INT, --width INT     plot width
                            [Default: 120]
    -e INT, --height INT    plot height
                            [Default: 30]
    --title STR             plot title
                            [Default: ]
    -a FLOAT, --frac FLOAT  minimum fraction of a chromosome's loci a reference
                            set must contribute to be given its own colour.
                            [Default: 0.05]
    -f FILE, --fasta FILE   fasta file of sequences to paint.
    -b FILE, --busco FILE   filename busco result.
    -r FILE, --ref FILE     filename for reference busco result.
    -o FILE, --out FILE     filename for generated plot.
                            [Default: chr_paint.pdf]
"""


from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import BasicChromosome
from reportlab.lib.units import cm
from docopt import docopt
import seaborn as sns
import pandas as pd
import numpy as np
import re

__author__ = "Samuel Whiteford and Pablo Gonzalez de la Rosa"
__version__ = '0.0.2'


# These column names work for BUSCO V5
def read_busco(buscofile):
    """Load a tab-separated BUSCO result table into a DataFrame.

    The table is read without a header row: the ten columns are named
    explicitly and text following a ``#`` is treated as a comment.

    Args:
        buscofile: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with columns Busco_id, Status, Sequence, start, end,
        strand, Score, Length, OrthoDB_url and Description.
    """
    column_names = [
        "Busco_id", "Status", "Sequence",
        "start", "end", "strand", "Score", "Length",
        "OrthoDB_url", "Description",
    ]
    table = pd.read_csv(
        buscofile,
        sep='\t',
        comment='#',
        names=column_names,
        index_col=None,
    )
    return table


# Heng Li's readfq function
def readfq(fp):  # this is a generator function
    """Iterate over the records of a FASTA/FASTQ stream.

    Args:
        fp: an iterable of text lines (e.g. an open file handle).

    Yields:
        ``(name, sequence, quality)`` tuples. ``name`` is the header text
        after the ``>``/``@`` marker up to the first space. ``quality`` is
        ``None`` for FASTA records and for FASTQ records whose quality
        string ends before it reaches the sequence length.

    Line terminators are removed with ``rstrip('\\r\\n')`` rather than by
    dropping the last character, so a final line without a trailing newline
    keeps its last base and CRLF files do not leave ``\\r`` in the output.
    """
    last = None  # buffer keeping the last unprocessed header/separator line
    while True:
        if not last:  # the first record or a record following a fastq
            for line in fp:  # search for the start of the next record
                if line[0] in '>@':  # fasta/q header line
                    last = line.rstrip('\r\n')  # save this line
                    break
        if not last:
            break
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for line in fp:  # read the sequence
            if line[0] in '@+>':
                last = line.rstrip('\r\n')
                break
            seqs.append(line.rstrip('\r\n'))
        if not last or last[0] != '+':  # this is a fasta record
            yield name, ''.join(seqs), None  # yield a fasta record
            if not last:
                break
        else:  # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for line in fp:  # read the quality
                qual_line = line.rstrip('\r\n')
                seqs.append(qual_line)
                # Count only real quality characters, not line terminators.
                leng += len(qual_line)
                if leng >= len(seq):  # have read enough quality
                    last = None
                    yield name, seq, ''.join(seqs)  # yield a fastq record
                    break
            if last:  # reach EOF before reading enough quality
                yield name, seq, None  # yield a fasta record instead
                break


def reverse_complement_sequence(seq):
    """Return the reverse complement of a DNA sequence.

    A, C, G and T are complemented in upper and lower case (case is
    preserved). Any other character is carried over unchanged, only its
    position is reversed.

    Args:
        seq: a string (or other reversible sequence of single characters).

    Returns:
        The reverse-complemented sequence as a ``str``.
    """
    # Lowercase entries matter: callers match with re.I, so a lowercase
    # repeat must still be complemented rather than merely reversed.
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A',
                  'a': 't', 'c': 'g', 'g': 'c', 't': 'a'}
    return "".join(complement.get(base, base) for base in reversed(seq))


def parse_fasta(fastaFile, minLen, telomere, minTeloOccur, teloSerchSpace):
    """Scan a FASTA file for sequence lengths and telomeric repeats.

    Args:
        fastaFile: path of the FASTA file to read.
        minLen: only sequences strictly longer than this are recorded.
        telomere: telomeric repeat unit; it is inserted verbatim into a
            case-insensitive regular expression.
        minTeloOccur: minimum number of contiguous repeat copies required.
        teloSerchSpace: number of characters inspected at each sequence end.

    Returns:
        A tuple ``(telomereDict, featDict, karyotypeDict, maxLen)``:
        telomereDict has a key ``name + "L"`` and/or ``name + "R"`` (value 1)
        for every recorded sequence with a repeat run at its start/end;
        featDict maps each recorded sequence name to an empty list;
        karyotypeDict maps each recorded sequence name to its length;
        maxLen is the longest length seen among ALL records, including
        those not recorded because of ``minLen``.
    """
    # The patterns do not depend on the sequence: build them once.
    # The start of a sequence is searched for the reverse complement of the
    # repeat, the end for the repeat itself.
    repeat_run = "){" + str(minTeloOccur) + ",}"
    start_pattern = re.compile(
        "(" + reverse_complement_sequence(telomere) + repeat_run, re.I)
    end_pattern = re.compile("(" + telomere + repeat_run, re.I)

    maxLen = 0
    featDict = dict()
    telomereDict = dict()
    karyotypeDict = dict()
    # Context manager so the handle is closed even if parsing fails.
    with open(fastaFile) as queryfasta:
        for name, seq, _qual in readfq(queryfasta):
            seqlen = len(seq)
            if seqlen > maxLen:
                maxLen = seqlen
            if seqlen <= minLen:
                continue
            karyotypeDict[name] = seqlen
            featDict[name] = []
            if start_pattern.search(seq[0:teloSerchSpace]):
                telomereDict[name + "L"] = 1
            if end_pattern.search(seq[-teloSerchSpace:]):
                telomereDict[name + "R"] = 1
    return telomereDict, featDict, karyotypeDict, maxLen


def make_feat_dict(ranked_busco_LG, colorList, featDict=None):
    """Append one coloured SeqFeature per locus to the per-sequence lists.

    Columns of ``ranked_busco_LG`` are read by POSITION:
    1 = reference sequence, 4 = query sequence, 5/6 = query start/end,
    7 = abundance rank (1-based index into ``colorList``).

    Args:
        ranked_busco_LG: DataFrame of loci, expected to be ordered by query
            sequence and start position.
        colorList: colours indexed by ``rank - 1``.
        featDict: mapping of query sequence name to a list of features.
            When omitted, the module-level ``featDict`` is used, which is
            what the original two-argument call relied on implicitly.

    Returns:
        The same ``featDict`` mapping, with the new features appended.

    Raises:
        NameError: if ``featDict`` is omitted and no module-level
            ``featDict`` exists.
    """
    if featDict is None:
        featDict = globals().get("featDict")
        if featDict is None:
            raise NameError("featDict was not passed and no module-level "
                            "featDict is defined")

    labelled_block = None  # (query sequence, reference) last given a tag
    # itertuples(name=None) yields plain tuples, so positional access does
    # not depend on the deprecated integer-key lookup on a labelled Series.
    for row in ranked_busco_LG.itertuples(index=False, name=None):
        ref_seq, query_seq = row[1], row[4]
        # FeatureLocation rejects start > end, so order them defensively.
        start, end = sorted((int(row[5]), int(row[6])))
        featColor = colorList[int(row[7]) - 1]

        # Tag the locus whenever the reference changes OR a new query
        # sequence starts; colours are ranked per query sequence, so every
        # chromosome needs its own first tag even if the reference name is
        # the same as the one that closed the previous chromosome.
        if labelled_block != (query_seq, ref_seq):
            labelled_block = (query_seq, ref_seq)
            qualifiers = {"locus_tag": [ref_seq], "color": [featColor]}
        else:
            qualifiers = {"color": [featColor]}

        # `strand=None` was the default and is no longer passed: recent
        # Biopython releases appear to reject a `strand` argument here.
        feature = SeqFeature(FeatureLocation(start, end, ref=query_seq),
                             qualifiers=qualifiers)
        featDict[query_seq].append(feature)
    return featDict


def plot_chr(featDict, acceptedSeqs, karyotypeDict, telomereDict, maxLen, plotWidth, plotHeight, plotTitle, outFile):
    """Draw every accepted sequence as an annotated chromosome.

    Each sequence in ``acceptedSeqs`` becomes one chromosome whose body is
    annotated with ``featDict[seqname]``. A telomere cap is drawn at the
    start when ``seqname + "L"`` is in ``telomereDict`` and at the end when
    ``seqname + "R"`` is. The diagram is written to ``outFile`` with
    ``plotTitle`` as its title; the page is ``plotWidth`` x ``plotHeight``
    centimetres.
    """
    cap_size = 25e4  # size given to each drawn telomere cap
    organism = BasicChromosome.Organism()
    organism.page_size = (plotWidth*cm, plotHeight*cm)

    for seqname in acceptedSeqs:
        seq_length = karyotypeDict[seqname]
        chromosome = BasicChromosome.Chromosome(seqname)
        # Every chromosome shares one scale: the MAXIMUM length plus room
        # for the two telomere caps.
        chromosome.scale_num = maxLen + 2 * cap_size

        # Opening telomere, only when one was detected
        if (seqname + "L") in telomereDict:
            left_cap = BasicChromosome.TelomereSegment()
            left_cap.scale = cap_size
            chromosome.add(left_cap)

        # Chromosome body carrying the coloured features
        arm = BasicChromosome.AnnotatedChromosomeSegment(
            seq_length, featDict[seqname])
        arm.scale = seq_length
        chromosome.add(arm)

        # Closing telomere, only when one was detected
        if (seqname + "R") in telomereDict:
            right_cap = BasicChromosome.TelomereSegment(inverted=True)
            right_cap.scale = cap_size
            chromosome.add(right_cap)

        organism.add(chromosome)

    organism.draw(outFile, plotTitle)


if __name__ == "__main__":
    # Command-line entry point: parse options, read the fasta and the two
    # BUSCO tables, rank reference sequences per query sequence and draw.
    args = docopt(__doc__)
    fastaFile = args['--fasta']
    queryBuscoFile = args['--busco']
    refBuscoFile = args['--ref']
    outFile = args['--out']
    plotTitle = args['--title']
    minLen = int(args['--minLen'])
    telomere = args['--telomere']
    plotWidth = int(args['--width'])
    plotHeight = int(args['--height'])
    minFrac = float(args['--frac'])
    minTeloOccur = 10
    teloSerchSpace = 1000

    print("Parsing fasta", end="\t")
    # Parse query fasta file into names and lengths.
    # NOTE: featDict must stay a module-level name; make_feat_dict falls
    # back on it when it is not passed explicitly.
    (telomereDict, featDict, karyotypeDict, maxLen) = parse_fasta(
        fastaFile, minLen, telomere, minTeloOccur, teloSerchSpace)
    # A plain list (same order as the dict) is a safe operand for the
    # pandas `in @acceptedSeqs` query below.
    acceptedSeqs = list(karyotypeDict)
    print("Done")
    print(len(karyotypeDict),
        "sequences over the minimum size threshold (-m)", minLen)

    print("Parsing BUSCO tables", end="\t")
    locus_columns = ["Busco_id", "Sequence", "start", "end"]
    busco_ref = read_busco(refBuscoFile).query(
        'Status == "Complete"')[locus_columns]

    # Keep only loci found in sequences with enough length ("acceptedSeqs")
    busco_query = read_busco(queryBuscoFile).query(
        'Status == "Complete" & Sequence in @acceptedSeqs')[locus_columns]
    print("Done")

    print("Building one-sided alignment karyotype plot")
    # Reference columns get the suffix _x, query columns the suffix _y.
    # Sorting by (query sequence, start) gives the same row order as
    # grouping by query sequence and sorting each group, without relying
    # on groupby().apply() keeping the grouping column.
    ordrd_join_busco = pd.merge(busco_ref, busco_query, how='inner',
                                on=["Busco_id"]
                                ).sort_values(["Sequence_y", "start_y"]
                                              ).reset_index(drop=True)
    if ordrd_join_busco.empty:
        raise SystemExit("No complete BUSCO loci are shared between the "
                         "reference table and the accepted query sequences; "
                         "nothing to plot.")

    # Give a single color to all reference sequences that are marginally
    # present: less than minFrac of the markers in each query sequence.
    seqCounts = ordrd_join_busco.groupby(
        ["Sequence_y"])['Sequence_x'].value_counts()
    # transform() keeps the original index, so the mask below stays aligned
    # with seqCounts on every pandas version.
    fracseqCounts = seqCounts / seqCounts.groupby(
        level="Sequence_y").transform("sum")
    colapseqCount = seqCounts.copy()  # do not overwrite the raw counts
    colapseqCount[fracseqCounts < minFrac] = 0
    # Dense rank per query sequence: 1 = most frequent reference; all the
    # collapsed (zeroed) references share the last rank.
    # NOTE(review): references with identical counts also share a rank, and
    # therefore a colour.
    rankdSeqCounts = colapseqCount.groupby(level="Sequence_y"
                                           ).rank(method='dense', ascending=False)
    # Name the rank column first, then turn the index levels into columns,
    # so the merge keys are ordinary columns on both sides.
    dfrankdSeqCounts = rankdSeqCounts.rename("abundance_rank").reset_index()
    ranked_busco_LG = pd.merge(ordrd_join_busco, dfrankdSeqCounts, how='left',
                            on=["Sequence_y", "Sequence_x"]
                            ).sort_values(by=['Sequence_y', 'start_y'])
    # If a sequence is composed only of components with less than the minFrac,
    # either because the minFrac is quite high or because the sequence is a
    # potpourri, it could look like there is only the major fraction.
    # A separate colour is used instead of gray to differentiate between
    # these scenarios.
    # NOTE(review): despite its name, my_black is (0.9, 0.9, 0.9), i.e. a
    # very light gray; the value is kept as it was - confirm the intent.
    max_refs_per_seq = int(rankdSeqCounts.max())
    my_gray = [(0.6, 0.6, 0.6)]
    my_black = [(0.9, 0.9, 0.9)]
    if max_refs_per_seq > 1:
        palette = sns.color_palette("hls", max_refs_per_seq-1)
        my_cols = my_gray + list(palette)
    else:
        my_cols = my_black

    featDict = make_feat_dict(ranked_busco_LG, my_cols)
    plot_chr(featDict, acceptedSeqs, karyotypeDict,
             telomereDict, maxLen, plotWidth, plotHeight, plotTitle, outFile)