-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscore_substrate_similarities_positionwise.py
137 lines (116 loc) · 5.07 KB
/
score_substrate_similarities_positionwise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Aligns each combination of recognition sites for
# each protease using a custom static alignment
# method that checks each position with each corresponding
# position(x1 - y1, x2 - y2, ..., x8 - y8) to measure
# the degree of conservation of each site.
# Scoring is based on the BLOSUM62 table.
import json
import pandas as pd
from itertools import combinations
import multiprocessing as mp
# Load the data in
with open("data/substrates_data.json", "r") as in_file_name:
substrates_dict = dict(json.load(in_file_name))
# Initialize new dicts
one_substrate_proteases = []
mul_substrate_proteases = {}
# Separate proteases with only one substrate
for k, v in substrates_dict.items():
if len(v) == 1:
one_substrate_proteases.append(k)
else:
mul_substrate_proteases[k] = v
# Set cpus to be used, we will split the data in a list of int(mp.cpu_count() / 1.25) smaller dicts
cpus = int(mp.cpu_count() / 1.25)
# Initialize the list of smaller dicts
smaller_dicts = [{} for _ in range(cpus)]
# Keep track of current required operations for each index of the smaller_dicts list
current_ops_count = [0] * cpus
# Loop over k: v pairs of proteases
for k, v in mul_substrate_proteases.items():
# Calculate required operations for this protease
op_count = int((len(v) * (len(v) - 1)) / 2)
# Check which index has the least operations assigned
min_ops_index = current_ops_count.index(min(current_ops_count))
# Add the smaller_dict to the index with the least operations
smaller_dicts[min_ops_index][k] = v
# Update the list of operations for each index
current_ops_count[min_ops_index] += op_count
# Print some general data to quickly check that everythin is running smoothly
for i, d in enumerate(smaller_dicts):
print(
f"Smaller Dictionary {i+1}\tn. of operations:\t{sum([((len(v) * len(v) - 1) / 2) for v in d.values()])}")
print("Missing data:",
len(substrates_dict) - sum([len(d.values())
for d in smaller_dicts]) - len(one_substrate_proteases))
# Blosum62 table data
blosum62df = pd.read_csv("data/blosum62.csv", index_col=0)
def measure_similarity(return_list, blosum62, smaller_dict):
"""
Function to measure how similar the substrates are for each protease.
Each sequence is aligned against each other sequence and their average
score is calculated and returned in a dictionary. This is the function
that will be passed to the mp manager.
"""
# Initialize empty list
l = []
# Loop over each protease - substrate sequences list pair
# and calculate its score positionwise
for protease, seqs_list in smaller_dict.items():
print(f"working on: {protease}...")
protease_scores = []
for seq1, seq2 in combinations(seqs_list, 2):
position_scores = []
for aa1, aa2 in zip(seq1, seq2):
position_scores.append(blosum62.loc[aa1, aa2])
protease_scores.append(sum(position_scores) / 8)
score = sum(protease_scores) / len(protease_scores)
l.append({protease: score})
return_list.append(l)
# Multiprocessing magic, calculate the score for each protease in parallel
with mp.Manager() as manager:
# Initialize a return list for appending the resulting dicts
return_list = manager.list()
# Start processing in parallel using the same number of cpus calculated above
pool = mp.Pool(processes=cpus)
pool.starmap(
measure_similarity, [
(return_list, blosum62df, smaller_dict) for smaller_dict in smaller_dicts
]
)
pool.close()
pool.join()
# Conver the resulting list to a proper list, before closing the Manager()
result = list(return_list)
def flatten_dict_list(nested_list):
"""
Recursive function to flatten the output list and make it a dict
of {protease: score} pairs
"""
flattened_dict = {}
for item in nested_list:
# Loop over each item in the nested list
if isinstance(item, dict):
# If its a dict, add it to the final flattened dict
flattened_dict.update(item)
elif isinstance(item, list):
# If its a list, call the function again
flattened_dict.update(flatten_dict_list(item))
return flattened_dict
# Flatten the nested list and convert to a single dictionary, then order
result_dict = flatten_dict_list(result)
result_dict = dict(sorted(result_dict.items(), key=lambda x: x[1]))
# Normalize the values between 0 and 1
max_val = max(result_dict.values())
min_val = min(result_dict.values())
max_minus_min = max(result_dict.values()) - min(result_dict.values())
for k, v in result_dict.items():
# z_i = (x_i – min(x)) / (max(x) – min(x))
result_dict[k] = (v - min_val) / (max_minus_min)
# Put both dictionaries in one for saving
final_dict = {}
final_dict["Proteases with more than one substrate"] = result_dict
final_dict["Proteases with only one substrate"] = one_substrate_proteases
# Dump in a json file
with open("data/scored_substrates_positionwise.json", "w") as out_file_name:
json.dump(final_dict, out_file_name, sort_keys=True, indent=4)