changed treatment of redundant characters for process_df and create_adapters
ryandkuster · Apr 2, 2024 · fc08561 · fc08561
1 parent 174af8f
commit fc08561
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 6 deletions.
diff --git a/readsynth.py b/readsynth.py
@@ -302,10 +302,10 @@ def create_adapters(args):
           'rs2']
     m1 = list(args.motif_dt1.keys())[0]
     m2 = list(args.motif_dt2.keys())[0]
-    a1[0] = a1[0] + m1[:args.motif_dt1[m1]]
-    a1[1] = m1[args.motif_dt1[m1]:] + a1[1]
-    a2[0] = a2[0] + m2[:args.motif_dt2[m2]]
-    a2[1] = m2[args.motif_dt2[m2]:] + a2[1]
+    a1[0] = a1[0] + args.m1[0][:args.motif_dt1[m1]]
+    a1[1] = args.m1[0][args.motif_dt1[m1]+1:] + a1[1]
+    a2[0] = a2[0] + args.m2[0][:args.motif_dt2[m2]]
+    a2[1] = args.m2[0][args.motif_dt2[m2]+1:] + a2[1]
 
     return [a1], [a2]
 
@@ -489,6 +489,17 @@ def process_df(df, digest_file, args):
             inplace=True)
     df = df.reset_index(drop=True)
 
+    """
+    remove fragments where RE motif sites overlap
+    """
+    df['min_len_m1'] = df['m1'].map(args.motif_len)
+    df['min_len_m2'] = df['m2'].map(args.motif_len)
+    df['min_len'] = df['min_len_m1'] + df['min_len_m2']
+    df = df[df['seq'].str.len() > df['min_len']]
+    df.drop('min_len_m1', axis=1, inplace=True)
+    df.drop('min_len_m2', axis=1, inplace=True)
+    df.drop('min_len', axis=1, inplace=True)
+
     """
     convert all redundant IUPAC codes to 'N'
     """
@@ -536,8 +547,9 @@ def process_df(df, digest_file, args):
     """
     add a quick step that removes appropriate over/underhang
     """
+
     for mot, front in args.motif_dt.items():
-        back = len(mot) - front
+        back = args.motif_len[mot] - front
         df.loc[(df['m1'] == mot) & (df['reverse'] == 0), 'seq'] = \
             df['seq'].str[front:]
         if back != 0:
@@ -560,7 +572,10 @@ def process_df(df, digest_file, args):
             df.loc[(df['m2'] == mot) & (df['reverse'] == 1), 'revc'] = \
                 df['revc'].str[:-back]
 
+
     df['length'] = df['seq'].str.len()
+    #df = df[(df['seq'].str.len() > 0) & (df['revc'].str.len() > 0)]
+
     df = df.sort_values(by=['length'])
     df = df.reset_index(drop=True)
     df.to_csv(digest_file, index=None)

diff --git a/scripts/write_reads.py b/scripts/write_reads.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 import random
-import sys #TODO
+import sys
 
 def main(df, r1, r2, gen_name, args):
     '''

diff --git a/test/test_readsynth.py b/test/test_readsynth.py
@@ -144,6 +144,8 @@ def test_create_adapters_1(self):
         args = Variables()
         args.motif_dt1 = {'GCGC': 3}
         args.motif_dt2 = {'TTAA': 1}
+        args.m1 = ['GCG/C']
+        args.m2 = ['T/TAA']
         a1 = [['AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGGCG',
                'CCTGTCTCTTATACACATCTGACGCTGCCGACGAGTGTAGATCTCGGTGGTCGCCGTATCATT',
                'rs1']]
@@ -158,6 +160,8 @@ def test_create_adapters_2(self):
         args = Variables()
         args.motif_dt1 = {'GAATTC': 1}
         args.motif_dt2 = {'TTAA': 1}
+        args.m1 = ['G/AATTC']
+        args.m2 = ['T/TAA']
         a1 = [['AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGG',
                'AATTCCTGTCTCTTATACACATCTGACGCTGCCGACGAGTGTAGATCTCGGTGGTCGCCGTATCATT',
                'rs1']]
@@ -374,6 +378,7 @@ def test_process_df_1(self):
         args.motif_dt = {'GCGC': 3}
         args.motif_dt1 = {'GCGC': 3}
         args.motif_dt2 = {'GCGC': 3}
+        args.motif_len = {'GCGC': 4}
         df = pd.read_csv(
             'test_data/genomes/pre_process_hhai_hhai_test1.fasta.csv')
         digest_file = 'test_data/genomes/hhai_hhai_process_df_test.csv'
@@ -392,6 +397,7 @@ def test_process_df_2(self):
         args.motif_dt = {'GCGC': 3, 'TTAA': 1}
         args.motif_dt1 = {'GCGC': 3}
         args.motif_dt2 = {'TTAA': 1}
+        args.motif_len = {'GCGC': 4, 'TTAA': 4}
         df = pd.read_csv(
             'test_data/genomes/pre_process_hhai_msei_test1.fasta.csv')
         digest_file = 'test_data/genomes/hhai_msei_process_df_test.csv'