cosmetic changes, removing debugging lines

ablab · Oct 26, 2023 · 14243df · 14243df
1 parent 94fb2ca
commit 14243df
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -200,7 +200,7 @@ interested in comparing expression between different replicas/conditions within
 
 ### Specifying input data via dataset description file
 
-This option will be deprecated in future releases.
+This option will be deprecated in future releases. To process multiple experiments, please use `--yaml` instead.
 
 If you wish to process several independent experiments in a single run, you should provide a dataset description
 file via `--fastq_list` or `--bam_list` (see description below).
@@ -234,22 +234,22 @@ To provide all input files in a single file, you can provide a yaml file via `--
 A distinct output folder with individual GTFs and abundance tables will be generated for each experiment.
 In this option, bam files with short reads for correction can be provided for each experiment.
 
-The yaml file contains a list of experiments in square brackets. The first entry in the list should be the type of files the experiments contain, written as `data format: ` followed by the type in quitation marks. The type can be either fastq or bam. Each experiment is represented by a set of curly brackets around a set of parameters. Each experiment should have a name and one or multiple input files in either fastq or bam format. Additionally it may contain one or multiple bam files with short reads. The name is provided as `name: ` followed by the experiment name in quotation marks. Both short and long read files are provided as a list of file paths in quotation marks, following `long read files: ` and `illumina bam: ` respectively. Labels for the files can also be set with `labels: `. The number of labels needs to be the same as the number of files. All entries are separated by commata. For example:
+The yaml file contains a list of experiments in square brackets. The first entry in the list should be the type of files the experiments contain, written as `data format: ` followed by the type in quotation marks. The type can be either fastq or bam. Each experiment is represented by a set of curly brackets around a set of parameters. Each experiment should have a name and one or more input files in either fastq or bam format. Additionally, it may contain one or more bam files with short reads. The name is provided as `name: ` followed by the experiment name in quotation marks. Both long and short read files are provided as lists of file paths in quotation marks, following `long read files: ` and `illumina bam: ` respectively. Labels for the files can also be set with `labels: `. The number of labels needs to be the same as the number of files. All entries are separated by commas. For example:
 
 ```
 [
   data format: "fastq",
   {
     name: "experiment1",
-    long_read_files: [
+    long read files: [
       "/PATH/TO/FILE1.fastq",
       "/PATH/TO/FILE2.fastq"
     ],
     illumina bam: ["PATH/TO/ILLUMINA1.bam"]
   },
   {
     name: "experiment2",
-    long_read_files: [
+    long read files: [
       "/PATH/TO/FILE3.fastq"
     ],
     illumina bam: ["PATH/TO/ILLUMINA2.bam"]

diff --git a/src/input_data_storage.py b/src/input_data_storage.py
@@ -185,9 +185,7 @@ def get_samples_from_file(self, file_name):
             for lib in sample:
                 for in_file in lib:
                     check_input_type(in_file, self.input_type)
-
-        #print(sample_files)
-        #print(illumina_bam)
+
         return sample_files, experiment_names, readable_names_dict, illumina_bam
 
     def has_replicas(self):
@@ -211,7 +209,6 @@ def get_samples_from_yaml(self, file_name):
             if len(t.keys()) > 1:
                 logger.warning("The first entry should only specify the input data format. Any additional info will be ignored")
             if  t['data format'] == "bam":
-                #print("yes")
                 self.input_type = "bam"
                 print(self.input_type)
             elif t['data format'] == "fastq" or t['data format'] == "fasta":
@@ -232,12 +229,12 @@ def get_samples_from_yaml(self, file_name):
                     logger.warning("Duplicate folder prefix %s, will change to %s" %
                                    (current_sample_name, new_sample_name))
                     current_sample_name = new_sample_name
+            current_index += 1
             if not 'long read files' in sample.keys():
                 logger.critical("Experiment %s does not contain any files" %current_sample_name)
                 exit(-2)
             else:
                 current_sample = sample['long read files']
-                #print(current_sample)
                 names = 'labels' in sample.keys()
                 if names and not len(sample['labels']) == len(current_sample):
                     logger.critical("The number of file aliases differs from the number of files")
@@ -260,31 +257,29 @@ def get_samples_from_yaml(self, file_name):
                 else:
                     illumina_bam.append(None)
 
-        # this is one for loop too many check why it works above and see if I have a list too little
-        # either use extra list or remove one loop. check with andrey if we actually have libs in samples
-        #print(sample_files)
         for sample in sample_files:
             for lib in sample:
                 for in_file in lib:
                     check_input_type(in_file, self.input_type)
         return sample_files, experiment_names, readable_names_dict, illumina_bam
 
-    # what do I use here? sample files? or all files?
-    def get_sample_name(names, index):
-        common_characters = len(names[0])
-        common_name = names[0]
+# Not functional yet.
+# Idea for the future: name unnamed samples after the last folder that all of their file paths have in common.
+    # def get_sample_name(names, index):
+        # common_characters = len(names[0])
+        # common_name = names[0]
 
-        for i in range(1, len(names)):
-            p = mismatch(common_name, names[i])
-            if p[0] < common_characters:
-                common_characters = p[0]
+        # for i in range(1, len(names)):
+            # p = mismatch(common_name, names[i])
+            # if p[0] < common_characters:
+                # common_characters = p[0]
 
-        found = common_names.rfind('/', 0, common_characters)
+        # found = common_names.rfind('/', 0, common_characters)
 
-        common_name = common_name[:found]
+        # common_name = common_name[:found]
 
-        sample_name = common_name + str(index)
-        return sample_name
+        # sample_name = common_name + str(index)
+        # return sample_name
 
 
 def check_input_type(fname, input_type):