Skip to content

Commit

Permalink
changed treatment of redundant characters for process_df and create_adapters
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandkuster committed Apr 2, 2024
1 parent 174af8f commit fc08561
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 6 deletions.
25 changes: 20 additions & 5 deletions readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,10 @@ def create_adapters(args):
'rs2']
m1 = list(args.motif_dt1.keys())[0]
m2 = list(args.motif_dt2.keys())[0]
a1[0] = a1[0] + m1[:args.motif_dt1[m1]]
a1[1] = m1[args.motif_dt1[m1]:] + a1[1]
a2[0] = a2[0] + m2[:args.motif_dt2[m2]]
a2[1] = m2[args.motif_dt2[m2]:] + a2[1]
a1[0] = a1[0] + args.m1[0][:args.motif_dt1[m1]]
a1[1] = args.m1[0][args.motif_dt1[m1]+1:] + a1[1]
a2[0] = a2[0] + args.m2[0][:args.motif_dt2[m2]]
a2[1] = args.m2[0][args.motif_dt2[m2]+1:] + a2[1]

return [a1], [a2]

Expand Down Expand Up @@ -489,6 +489,17 @@ def process_df(df, digest_file, args):
inplace=True)
df = df.reset_index(drop=True)

"""
remove fragments where RE motif sites overlap
"""
df['min_len_m1'] = df['m1'].map(args.motif_len)
df['min_len_m2'] = df['m2'].map(args.motif_len)
df['min_len'] = df['min_len_m1'] + df['min_len_m2']
df = df[df['seq'].str.len() > df['min_len']]
df.drop('min_len_m1', axis=1, inplace=True)
df.drop('min_len_m2', axis=1, inplace=True)
df.drop('min_len', axis=1, inplace=True)

"""
convert all redundant IUPAC codes to 'N'
"""
Expand Down Expand Up @@ -536,8 +547,9 @@ def process_df(df, digest_file, args):
"""
add a quick step that removes appropriate over/underhang
"""

for mot, front in args.motif_dt.items():
back = len(mot) - front
back = args.motif_len[mot] - front
df.loc[(df['m1'] == mot) & (df['reverse'] == 0), 'seq'] = \
df['seq'].str[front:]
if back != 0:
Expand All @@ -560,7 +572,10 @@ def process_df(df, digest_file, args):
df.loc[(df['m2'] == mot) & (df['reverse'] == 1), 'revc'] = \
df['revc'].str[:-back]


df['length'] = df['seq'].str.len()
#df = df[(df['seq'].str.len() > 0) & (df['revc'].str.len() > 0)]

df = df.sort_values(by=['length'])
df = df.reset_index(drop=True)
df.to_csv(digest_file, index=None)
Expand Down
2 changes: 1 addition & 1 deletion scripts/write_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
import random
import sys #TODO
import sys

def main(df, r1, r2, gen_name, args):
'''
Expand Down
6 changes: 6 additions & 0 deletions test/test_readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ def test_create_adapters_1(self):
args = Variables()
args.motif_dt1 = {'GCGC': 3}
args.motif_dt2 = {'TTAA': 1}
args.m1 = ['GCG/C']
args.m2 = ['T/TAA']
a1 = [['AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGGCG',
'CCTGTCTCTTATACACATCTGACGCTGCCGACGAGTGTAGATCTCGGTGGTCGCCGTATCATT',
'rs1']]
Expand All @@ -158,6 +160,8 @@ def test_create_adapters_2(self):
args = Variables()
args.motif_dt1 = {'GAATTC': 1}
args.motif_dt2 = {'TTAA': 1}
args.m1 = ['G/AATTC']
args.m2 = ['T/TAA']
a1 = [['AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGG',
'AATTCCTGTCTCTTATACACATCTGACGCTGCCGACGAGTGTAGATCTCGGTGGTCGCCGTATCATT',
'rs1']]
Expand Down Expand Up @@ -374,6 +378,7 @@ def test_process_df_1(self):
args.motif_dt = {'GCGC': 3}
args.motif_dt1 = {'GCGC': 3}
args.motif_dt2 = {'GCGC': 3}
args.motif_len = {'GCGC': 4}
df = pd.read_csv(
'test_data/genomes/pre_process_hhai_hhai_test1.fasta.csv')
digest_file = 'test_data/genomes/hhai_hhai_process_df_test.csv'
Expand All @@ -392,6 +397,7 @@ def test_process_df_2(self):
args.motif_dt = {'GCGC': 3, 'TTAA': 1}
args.motif_dt1 = {'GCGC': 3}
args.motif_dt2 = {'TTAA': 1}
args.motif_len = {'GCGC': 4, 'TTAA': 4}
df = pd.read_csv(
'test_data/genomes/pre_process_hhai_msei_test1.fasta.csv')
digest_file = 'test_data/genomes/hhai_msei_process_df_test.csv'
Expand Down

0 comments on commit fc08561

Please sign in to comment.