-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPepStats_Tables_v1.0.0.py
executable file
·494 lines (409 loc) · 21.1 KB
/
PepStats_Tables_v1.0.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
#!/usr/bin/env python3
import platform
import os
import sys
import shutil
import argparse
import subprocess
import tempfile
import datetime
import locale
import re
from textwrap import dedent
from packaging import version
# Defining Script Name
script_name = os.path.basename(sys.argv[0])
# Defining Script Current Version
script_version = "1.0.0"
# Defining_Script_Initial_Version_Data (date '+DATE:%Y/%m/%d%tTIME:%R')
version_date_initial = "DATE:2021/07/17 TIME:00:00"
# Defining_Script_Current_Version_Data (date '+DATE:%Y/%m/%d%tTIME:%R')
version_date_current = "DATE:2024/02/21 TIME:15:58"
copyright = dedent("""
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. """)
authors = dedent("""
Author: Rodolfo Aramayo
WORK_EMAIL: [email protected]
PERSONAL_EMAIL: [email protected]
""")
usage = dedent(f"""
########################################################################################################################################################################################################
ARAMAYO_LAB
{copyright}
SCRIPT_NAME: {script_name}
SCRIPT_VERSION: {script_version}
USAGE: {script_name}
-p Homo_sapiens.GRCh38.pep.all.fa # REQUIRED (Proteins File - Proteome)
-r PepStats_Tables # OPTIONAL (Run Name)
-z TMPDIR Location # OPTIONAL (default=0='TMP TMPDIR Run')
TYPICAL COMMANDS:
{script_name} -p Homo_sapiens.GRCh38.pep.all.fa -r PepStats_Tables
INPUT01: -p FLAG REQUIRED - Protein File
INPUT01_FORMAT: Fasta Format
INPUT01_DEFAULT: No default
INPUT02: -r FLAG OPTIONAL - Run Name
INPUT02_FORMAT: Text
INPUT02_DEFAULT: PepStats_Tables
INPUT03: -z FLAG OPTIONAL input
INPUT03_FORMAT: Numeric: 0 == TMP TMPDIR Run | 1 == Local TMP Run
INPUT03_DEFAULT: 0 == TMP TMPDIR Run
INPUT03_NOTES: 0 Processes the data in the TMP $TMPDIR directory of the computer used or of the node assigned by the SuperComputer scheduler
INPUT03_NOTES: Processing the data in the $TMPDIR directory of the node assigned by the SuperComputer scheduler reduces the possibility of file error generation due to network traffic
INPUT03_NOTES: 1 Processes the data in the same directory where the script is being run
DEPENDENCIES: EMBOSS: Required (see: http://emboss.open-bio.org/html/adm/ch01s01.html)
{authors}
########################################################################################################################################################################################################
""")
# print (usage)
def error_handling_function():
print("An error occurred. Invoking the error handling function.")
print (usage)
sys.exit(1)
def setup_argparse():
parser = argparse.ArgumentParser(
description='Script Description and Usage',
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument(
'-p', '--proteome',
required=True,
help='Proteins File - Proteome'
)
parser.add_argument(
'-r',
'--run_name',
default='PepStats_Tables',
help='Optional Run Name'
)
parser.add_argument(
'-z',
'--tmp_dir',
type=int,
choices=[0, 1],
default=0,
help='TMPDIR Location (0 for TMP TMPDIR Run, 1 for Local TMPDIR Run)'
)
parser.add_argument(
'-v',
'--version',
action='version',
version=f'%(prog)s {script_version}',
help='Show program\'s version number and exit'
)
return parser
# Check if no arguments were provided
if len(sys.argv) == 1:
print("\nPlease enter required arguments")
print (usage)
sys.exit(1)
def main():
parser = setup_argparse()
args = parser.parse_args() # This line parses the command-line arguments
# Test print argument names and their values
# print("Entered command-line arguments and values:")
# for arg in vars(args):
# print(f"{arg}: {getattr(args, arg)}")
# Extract and print the name of the protein file
proteome_file_name = os.path.basename(args.proteome)
# print(f"Proteome File Name: {proteome_file_name}") # test_print_var
proteome, _ = os.path.splitext(proteome_file_name)
# print(f"Proteome Name: {proteome}") # test_print_var
run_name = os.path.basename(args.run_name)
# print(f"Run name: {run_name}") # test_print_var
tmp_dir = (args.tmp_dir)
# print(f"TMP dir: {tmp_dir}") # test_print_var
# Construct the directory path
current_working_directory = os.getcwd()
# print(f"Current Working Directory:", current_working_directory) # test_print_var
current_working_directory_path = os.path.dirname(current_working_directory)
# print(f"Directory Path: {current_working_directory_path}") # test_print_var
working_directory_name = f"{proteome_file_name}_{run_name}.dir"
# print(f"Working Directory Name: {working_directory_name}") # test_print_var
# Check if the working directory exists
if not os.path.isdir(working_directory_name):
# If the directory does not exist, create it
os.makedirs(working_directory_name)
else:
# If the directory exists, remove its contents
for filename in os.listdir(working_directory_name):
file_path = os.path.join(working_directory_name, filename)
try:
# If it's a file, remove it
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
# If it's a directory, remove it and all its contents
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
print(f'Failed to delete {file_path}. Reason: {e}')
def generate_or_clean_tmp_directory(proteome, run_name, tmp_dir):
global var_script_tmp_data_dir
if tmp_dir == 0:
# Use system TMPDIR if defined, else use a temporary directory
base_dir = os.getenv('TMPDIR', None)
if base_dir is not None:
# TMPDIR is defined; create a temp directory there
var_script_tmp_data_dir = tempfile.mkdtemp(prefix=f"{proteome_file_name}_{run_name}_", dir=base_dir)
else:
# TMPDIR not defined; create a temp directory using default location
var_script_tmp_data_dir = tempfile.mkdtemp(prefix=f"{proteome_file_name}_{run_name}_")
# print(f"Temporary directory created at: {var_script_tmp_data_dir}") # test_print_var
elif tmp_dir == 1:
# Define the script TMP data directory path
var_script_tmp_data_dir = os.path.join(os.getcwd(), f"{proteome_file_name}_{run_name}.tmp")
# Check if the directory exists
if not os.path.exists(var_script_tmp_data_dir):
os.makedirs(var_script_tmp_data_dir)
else:
# Move and rename the existing directory before creating a new one
new_dir_name = f"{var_script_tmp_data_dir}_{datetime.datetime.now().strftime('%Y_%m_%d_%H%M%S')}"
os.rename(var_script_tmp_data_dir, new_dir_name)
os.makedirs(var_script_tmp_data_dir)
# print(f"Temporary directory created at: {var_script_tmp_data_dir}") # test_print_var
generate_or_clean_tmp_directory(proteome, run_name, tmp_dir)
# Define the directory and log file path
log_dir = f"./{proteome_file_name}_{run_name}.dir"
log_file_path = os.path.join(log_dir, f"{proteome_file_name}_{run_name}.log")
# Ensure the directory exists
os.makedirs(log_dir, exist_ok=True)
# Initialize log file and record start time
time_execution_start = datetime.datetime.now()
# Writing the starting entry to the log file
with open(log_file_path, 'w') as log_file:
log_file.write(f"Starting Processing Proteome: {proteome_file_name} on: {time_execution_start}\n")
# Function to append messages to the log file
def append_to_log(message):
with open(log_file_path, 'a') as log_file:
log_file.write(message + "\n")
# Verifying Software Dependency Existence
append_to_log(f"Verifying Software Dependency Existence on: " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# Determining Current Computer Platform
osname = platform.system()
cputype = platform.machine()
plt = ""
if osname == "Linux" and cputype == "x86_64":
plt = "Linux"
elif osname == "Darwin" and cputype == "x86_64":
plt = "Darwin"
elif osname == "Darwin" and "arm" in cputype:
plt = "Silicon"
elif "CYGWIN_NT" in osname or "MINGW" in osname:
plt = "CYGWIN_NT"
elif osname == "Linux" and "arm" in cputype:
plt = "ARM"
append_to_log(f"Operational System: {osname}")
append_to_log(f"CPU TYPE: {cputype}")
append_to_log(f"Detected Platform: {plt}")
# Determining Python Version (as a stand-in for checking Bash version)
python_version = platform.python_version()
required_version = "3.6" # Example required version
append_to_log(f"Available Python Version: {python_version}")
append_to_log(f"Required Minimal Python Version: {required_version}")
if version.parse(python_version) >= version.parse(required_version):
append_to_log(f"Python version {python_version} is installed.")
else:
append_to_log(f"Python version 3.6 or higher is not installed.")
append_to_log(f"Please install Python version 3.6 or higher.")
# Exit the script if the required Python version is not installed
sys.exit(1)
# Check for pepstats installation
def check_dependency(command):
#original_path = os.environ['PATH']
#os.environ['PATH'] += os.pathsep + '/opt/local/bin' # Ensure this is the correct path for pepstats
result = subprocess.run(["which", command], capture_output=True, text=True)
#os.environ['PATH'] = original_path # Reset PATH to original value
return result.returncode == 0
# Append dependency check results to log
if check_dependency("pepstats"):
append_to_log(f"PepStats is Installed")
else:
append_to_log(f"EMBOSS pepstats is Not Installed")
print (usage)
# Handle the error as needed, perhaps by calling a custom `func_usage` function or exiting
sys.exit("Please install EMBOSS PepStats.")
# Logging platform information
append_to_log(f"Software Dependencies Verified on: " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
append_to_log(f"Script Running on: {osname}, {cputype}")
# Setting LC_ALL to "C"
os.environ["LC_ALL"] = "C" # Affects child processes
locale.setlocale(locale.LC_ALL, 'C') # Affects the current Python process
# Writing command issued and other details to the log
append_to_log(f"\nCommand Issued Was: {script_name} -p {proteome} -r {run_name} -z {tmp_dir}")
append_to_log(f"\tProteome Analyzed:\t{proteome_file_name}")
append_to_log(f"\tFile Type:\t\tProteome")
# Conditionally writing TMPDIR information based on tmp_dir
if tmp_dir == 0:
append_to_log(f"\tTMPDIR Requested:\tNormal Run")
append_to_log(f"\tTMPDIR Created at:\t{var_script_tmp_data_dir}")
else:
append_to_log(f"\tTMPDIR Requested:\tLocal TMPDIR Run")
append_to_log(f"\tTMPDIR Created at:\t{var_script_tmp_data_dir}")
# Construct the output file path
output_file_path = os.path.join(var_script_tmp_data_dir, f"001_{proteome}.out")
# Check if the file does not exist or is empty
if not os.path.isfile(output_file_path) or os.path.getsize(output_file_path) == 0:
# Construct the pepstats command
pepstats_command = [
"pepstats",
"-sequence", f"{proteome_file_name}",
"-outfile", output_file_path
]
# Execute the command, suppressing stderr
with open(os.devnull, 'w') as devnull:
subprocess.run(pepstats_command, stderr=devnull)
# File and file paths
file_001 = os.path.join(var_script_tmp_data_dir, f"001_{proteome}.out")
file_002 = os.path.join(var_script_tmp_data_dir, f"002_{proteome}.out")
file_003 = os.path.join(var_script_tmp_data_dir, f"003_{proteome}.out")
file_004 = os.path.join(var_script_tmp_data_dir, f"004_{proteome}.out")
file_005 = os.path.join(var_script_tmp_data_dir, f"005_{proteome}.out")
file_006 = os.path.join(var_script_tmp_data_dir, f"006_{proteome}.out")
file_007 = os.path.join(var_script_tmp_data_dir, f"007_{proteome}.out")
file_008 = os.path.join(var_script_tmp_data_dir, f"008_{proteome}.out")
# Define regex patterns for each condition
patterns = [
(re.compile(r"PEPSTATS"), 3),
(re.compile(r"Molecular weight"), 4),
(re.compile(r"Isoelectric Point ="), -1),
(re.compile(r"A = Ala"), 5),
(re.compile(r"C = Cys"), 4),
(re.compile(r"D = Asp"), 4),
(re.compile(r"E = Glu"), 4),
(re.compile(r"F = Phe"), 4),
(re.compile(r"G = Gly"), 4),
(re.compile(r"H = His"), 4),
(re.compile(r"A = Ala"), 4),
(re.compile(r"I = Ile"), 4),
(re.compile(r"K = Lys"), 4),
(re.compile(r"L = Leu"), 4),
(re.compile(r"M = Met"), 4),
(re.compile(r"N = Asn"), 4),
(re.compile(r"P = Pro"), 4),
(re.compile(r"Q = Gln"), 4),
(re.compile(r"R = Arg"), 4),
(re.compile(r"S = Ser"), 4),
(re.compile(r"T = Thr"), 4),
(re.compile(r"V = Val"), 4),
(re.compile(r"W = Trp"), 4),
(re.compile(r"Y = Tyr"), 4),
(re.compile(r"Tiny"), -1),
(re.compile(r"Small"), -1),
(re.compile(r"Aliphatic"), -1),
(re.compile(r"Aromatic"), -1),
(re.compile(r"Non-polar"), -1),
(re.compile(r"Polar"), -1),
(re.compile(r"Charged"), -1),
(re.compile(r"Basic"), -1),
(re.compile(r"Acidic"), -1),
]
nl = False
with open(file_001, 'r') as input_file, open(file_002, 'w') as output_file:
for line in input_file:
for pattern, field_index in patterns:
if pattern.search(line):
words = line.split()
if nl:
output_file.write("\n")
else:
nl = True
# Handle the case where field_index is -1 (i.e., use the last field)
if field_index == -1:
field_index = len(words)
output_file.write(f"{words[field_index - 1]} ")
break # Move to the next line after a match
# Print a final newline at the end of the file
output_file.write("\n")
# Format and combine data from file_002 and file_001 into file_003
# Number of lines in each entry, including the identifier line
lines_per_entry = 32
def process_file(file_002, file_003):
with open(file_002, 'r') as input_file, open(file_003, 'w') as output_file:
# Initialize a counter to track the number of lines processed for the current entry
line_count = 0
# Initialize a list to accumulate lines for the current entry
current_entry_lines = []
for line in input_file:
# Strip leading/trailing whitespace
line = line.strip()
# Skip empty lines if they are not expected within an entry
if not line:
continue
# Add the current line to the entry's lines
current_entry_lines.append(line)
line_count += 1
# Check if we've accumulated all lines for the current entry
if line_count == lines_per_entry:
# Join the lines with a tab separator and write to the output file
output_file.write('\t'.join(current_entry_lines) + '\n')
# Reset the counter and the list for the next entry
line_count = 0
current_entry_lines = []
# Handle the last entry if the file doesn't end with a full set of lines_per_entry
# This step may be unnecessary if each entry is guaranteed to have exactly lines_per_entry lines
if current_entry_lines:
output_file.write('\t'.join(current_entry_lines) + '\n')
process_file(file_002, file_003)
# Create a header row and write it to file_004
header = "Protein_ID\tMolecular_weight\tIsoelectric_Point\t" \
"Mole%_Ala\tMole%_Cys\tMole%_Asp\tMole%_Glu\tMole%_Phe\tMole%_Gly\tMole%_His\tMole%_Ile\tMole%_Lys\tMole%_Leu\t" \
"Mole%_Met\tMole%_Asn\tMole%_Pro\tMole%_Gln\tMole%_Arg\tMole%_Ser\tMole%_Thr\tMole%_Val\tMole%_Trp\tMole%_Tyr\t" \
"Mole%_Tiny\tMole%_Small\tMole%_Aliphatic\tMole%_Aromatic\tMole%_Non-polar\tMole%_Polar\tMole%_Charged\tMole%_Basic\tMole%_Acidic\n"
with open(file_004, 'w') as f_header:
f_header.write(header)
# Concatenate file_004 and file_003 into file_005
with open(file_005, 'w') as f_final, open(file_004, 'r') as f_header, open(file_003, 'r') as f_data:
f_final.write(f_header.read())
f_final.write(f_data.read())
def transpose_entries(file_002, file_006, lines_per_entry):
# Read all lines from the input file and group them
with open(file_002, 'r') as file:
lines = file.read().strip().split('\n')
# Group every lines_per_entry lines into an entry
entries = [lines[i:i + lines_per_entry] for i in range(0, len(lines), lines_per_entry)]
# Transpose the list of entries
transposed = list(zip(*entries))
# Write the transposed lines to the output file, separating entries with tabs
with open(file_006, 'w') as file:
for line_group in transposed:
file.write('\t'.join(line_group) + '\n')
transpose_entries(file_002, file_006, lines_per_entry)
# Create a column row and write it to file_007
column = "Protein_ID\nMolecular_weight\nIsoelectric_Point\n" \
"Mole%_Ala\nMole%_Cys\nMole%_Asp\nMole%_Glu\nMole%_Phe\nMole%_Gly\nMole%_His\nMole%_Ile\nMole%_Lys\nMole%_Leu\n" \
"Mole%_Met\nMole%_Asn\nMole%_Pro\nMole%_Gln\nMole%_Arg\nMole%_Ser\nMole%_Thr\nMole%_Val\nMole%_Trp\nMole%_Tyr\n" \
"Mole%_Tiny\nMole%_Small\nMole%_Aliphatic\nMole%_Aromatic\nMole%_Non-polar\nMole%_Polar\nMole%_Charged\nMole%_Basic\nMole%_Acidic\n"
with open(file_007, 'w') as f_column:
f_column.write(column)
# Open file_008 for writing, and file_007 and file_006 for reading
with open(file_008, 'w') as f_final, open(file_007, 'r') as f_header, open(file_006, 'r') as f_data:
# Iterate over the lines of file_007 and file_006 simultaneously
for header_line, data_line in zip(f_header, f_data):
# Strip newline characters and concatenate the lines with a tab in between
combined_line = header_line.strip() + '\t' + data_line.strip() + '\n'
# Write the combined line to file_008
f_final.write(combined_line)
main_pepstats_run = os.path.join(log_dir, f"{proteome_file_name}_{run_name}.00_Main_PepStats_Analysis")
table_01_pepstats_run = os.path.join(log_dir, f"{proteome_file_name}_{run_name}.01_PepStats_Table_01")
table_02_pepstats_run = os.path.join(log_dir, f"{proteome_file_name}_{run_name}.02_PepStats_Table_02")
shutil.move(file_001, main_pepstats_run)
shutil.move(file_005, table_01_pepstats_run)
shutil.move(file_008, table_02_pepstats_run)
# Remove the TMPDIR directory and all its contents
shutil.rmtree(var_script_tmp_data_dir)
# Record stop time and calculate duration
time_execution_stop = datetime.datetime.now()
execution_duration_seconds = (time_execution_stop - time_execution_start).total_seconds()
execution_duration_minutes = execution_duration_seconds / 60
execution_duration_hours = execution_duration_minutes / 60
# Writing the closing entry to the log file
with open(log_file_path, 'a') as log_file: # Open in append mode to add to the file
log_file.write(f"\nFinishing Processing proteome {proteome} on: {time_execution_stop}\n")
log_file.write(f"Script Runtime: {execution_duration_seconds} seconds\n")
log_file.write(f"Script Runtime: {execution_duration_minutes:.2f} minutes\n")
log_file.write(f"Script Runtime: {execution_duration_hours:.2f} hours\n")
if __name__ == "__main__":
main()