# Name: combine_sources.py
# Language: python3
# Author: Edoardo Sarti
# Date: Apr 03 2022
from supporting_functions import *
def pdb_query(path="pdb_query.json"):
"""Returns the json query for the PDB API (2021-)
"""
with open(path) as pf:
return pf.read()
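# Illustrative usage (sketch, not part of this module's API): the template returned by
# pdb_query() contains a "<PDBI>" placeholder that scan_PDB later replaces with a quoted,
# comma-separated list of PDB IDs before posting it to the RCSB GraphQL endpoint, e.g.
# (hypothetical IDs):
#   query = pdb_query().replace("<PDBI>", '"1OKC", "2RH1"')
#   requests.post('https://data.rcsb.org/graphql?', json={'query': query})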
def check_update_repo(options, locations, record=False,
thr_log_status='ERROR'):
"""Determines whether the repository must be updated (i.e. new structures
must be retrieved)
"""
this_name = check_update_repo.__name__
# Options and locations aliases
OPT_download_date_threshold =\
int(options['RUN'][('', 'download_date_threshold')]) # Days from last update
OPT_force_update = options['RUN'][('fud', 'force_update')]
LOC_OPMlastddate = locations['SYSFILES']['OPMlastddate']
LOC_main = locations['FSYSPATH']['main']
# Checks current time and date
update_repo = False
now_datetime = datetime.datetime.now()
now_timestamp = time.mktime(now_datetime.timetuple()) # Epoch time in seconds
# If "force update" option is present, return True
if OPT_force_update:
print_log((
'NOTICE',
this_name,
f"Force update contents of the folder {LOC_main}"))
update_repo = True
# If time from last update > download_date_threshold option, return True
elif os.path.exists(LOC_OPMlastddate):
# Time span conversion into days
with open(LOC_OPMlastddate) as ldd_file:
for line in ldd_file:
last_download_timestamp = line.split()[0] # Epoch time in seconds
len_ldt = len(last_download_timestamp)
                last_download_datetime = line[len_ldt:].strip()
                time_span_days = int((int(now_timestamp) - int(last_download_timestamp))
                                     /(24*3600))
# If time from last update > download_date_threshold opt, return True
if time_span_days > OPT_download_date_threshold:
print_log((
'NOTICE',
this_name,
("Update contents of the folder {0}\nLast update: {1}"
"\nDeadline: {2} days"
).format(
LOC_main,
last_download_datetime,
OPT_download_date_threshold)
))
update_repo = True
# Else, it must be the first update.
else:
print_log((
'NOTICE',
this_name,
("First update contents of the folder {0} (or file {1} "
"not found)").format(
LOC_main,
LOC_OPMlastddate)
))
update_repo = True
# If the date is not recorded, the update will be issued again next time
if record:
print_log((
'NOTICE',
this_name,
"Update complete, new date recorded"))
with open(LOC_OPMlastddate, 'w') as repo_date_file:
repo_date_file.write("{0} {1}\n".format(
int(now_timestamp),
now_datetime.strftime('%Y-%m-%d %H:%M:%S')))
return update_repo
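# Usage sketch (mirrors the calls made in scan_OPM below): the check is run once without
# recording, and again with record=True after a successful download, so that the file
# locations['SYSFILES']['OPMlastddate'] stores "<epoch seconds> <YYYY-MM-DD HH:MM:SS>"
# for the next run:
#   if check_update_repo(options, locations):
#       ...  # download new OPM data
#   check_update_repo(options, locations, record=True)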
def mainlist_from_OPM(options, locations, str_data={},
use_pkl=False, # Deprecated?
thr_log_status='ERROR'):
"""Returns the list of primary and secondary OPM entries to be considered
during this instance of EncoMPASS
"""
this_name = mainlist_from_OPM.__name__
# Options and locations aliases
OPT_from_repo = options['RUN'][('repo', 'from_repo')]
OPT_offline = options['RUN'][('ol', 'offline')]
OPT_sample_structures = options['EXECUTION'][('sample', 'sample_structures')]
OPT_debug = options['EXECUTION'][('debug', 'debug')]
LOC_OPMjsons = locations['FSYSPATH']['OPMjsons']
LOC_OPMreprchart = locations['SYSFILES']['OPMreprchart']
# Statistics
local_stats = {
'primary' : [],
'secondary' : [],
'associated_secondary' : [],
'selected_primary' : [],
'selected_secondary' : [],
'temp_primary' : [],
'sample_not_found' : []
}
# Check valid option setting
if OPT_offline and (not OPT_from_repo):
print_log((
'CRITICAL',
this_name,
("Offline option is on: please use from_repo option to specify a "
"local repository where to take the OPM entry list from")
))
# Check whether OPM must be queried
update_repo = check_update_repo(options, locations)
# Download/retrieve main json file with PDB codes of primary structures
mainjson_filename = LOC_OPMjsons + 'main.json'
repofilename = OPT_from_repo + LOC_OPMjsons + 'main.json'
if OPT_from_repo and os.path.exists(repofilename):
shutil.copyfile(repofilename, mainjson_filename)
else:
res = requests.get(
# The .../types/1/... specifies this is the list for type-1 OPM structures (Transmembrane)
"https://lomize-group-opm.herokuapp.com/types/1/primary_structures?pageSize=100000",
params={},
headers={"Accept" : "application/json"}
)
with open(mainjson_filename, 'w') as f:
data = res.json()
f.write(json.dumps(data))
local_stats['primary'] = [x['pdbid'] for x in data['objects']]
if not os.path.exists(mainjson_filename):
print_log((
'CRITICAL',
this_name,
"Could not retrieve the list of primary OPM structures"
))
# Extract primary entry info from json file
if OPT_debug:
print("Reading JSON file", mainjson_filename)
with codecs.open(mainjson_filename, 'r+', encoding="utf-8") as mainjson_f:
        data = json.load(mainjson_f)  # the stream is already utf-8-decoded by codecs.open
ids = [(x['id'], x['pdbid'], False) for x in data['objects']] # Primary (<int>ID, <str>PDBID, <bool>is_temp) list
ids_d = {x['id'] : x['pdbid'] for x in data['objects']} # Primary <int>ID-><str>PDBID dict (internal use)
revids_d = {x['pdbid'] : x['id'] for x in data['objects']} # Primary <str>PDBID-><int>ID dict (internal use)
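        # Illustrative shape of main.json (only the fields used here are shown, values
        # are hypothetical):
        #   {"objects": [{"id": 123, "pdbid": "1okc", ...}, ...]}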
# Download/retrieve secondary structure json file
secjson_filename = LOC_OPMjsons + 'secondary.json'
repofilename = OPT_from_repo + LOC_OPMjsons + 'secondary.json'
if OPT_from_repo and os.path.exists(repofilename):
shutil.copyfile(repofilename, secjson_filename)
else:
if not OPT_offline:
res = requests.get(
"https://lomize-group-opm.herokuapp.com/secondary_representations?pageSize=1000000",
params={},
headers={"Accept" : "application/json"}
)
with open(secjson_filename, 'w') as f:
data = res.json()
f.write(json.dumps(data))
local_stats['secondary'] = [x['pdbid'] for x in data['table']['objects']]
if not os.path.exists(secjson_filename):
print_log((
'CRITICAL',
this_name,
"Could not retrieve the list of secondary OPM structures"
))
# Extract secondary entries info from json file
with codecs.open(secjson_filename, 'r+', encoding="utf-8") as secjson_f:
        data = json.load(secjson_f)  # the stream is already utf-8-decoded by codecs.open
secrep = [] # Secondary <str>PDBID list (internal use)
secrep_d = {} # Secondary <str>PDBID->[<str>PDBID, ...] dict
revsecrep_d = {} # Secondary <str>PDBID-><int>ID dict (internal use)
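    # Illustrative shape of secondary.json (only the fields used below are shown, values
    # are hypothetical):
    #   {"table": {"objects": [{"pdbid": "2abc", "primary_structure_id": 123, ...}, ...]}}
    # secrep_d maps each primary PDBID to the list of its secondary PDBIDs, while
    # revsecrep_d maps each secondary PDBID back to its primary structure id.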
for x in data['table']['objects']:
if x['primary_structure_id'] not in ids_d:
#print("CONT", x['primary_structure_id'])
continue
if ids_d[x['primary_structure_id']] not in secrep_d:
secrep_d[ids_d[x['primary_structure_id']]] = []
if x['pdbid'] not in local_stats['associated_secondary']:
local_stats['associated_secondary'].append(x['pdbid'])
secrep.append(x['pdbid'])
secrep_d[ids_d[x['primary_structure_id']]].append(x['pdbid'])
revsecrep_d[x['pdbid']] = x['primary_structure_id']
# Only choose the structures in the sample list. Primary structures
# corresponding to cited secondary ones will be added and then removed
# after the OPM data retrieval
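    # Illustrative sample list (hypothetical entries): ["1okc", "2rh1+"] selects 1okc alone
    # and 2rh1 together with all of its OPM secondary representations (the trailing "+"
    # is the flag handled below).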
if OPT_sample_structures:
new_ids = [] # Primary (<int>ID, <str>PDBID, <bool>is_temp) list
        new_secrep_d = {} # Primary <str>PDBID->[<str>secondary PDBID, ...] dict
ids1 = [x[1] for x in ids]
for entry in OPT_sample_structures:
if entry[:4] in ids1:
# If a temporary primary entry was added because of an orphan secondary entry,
# but later in the list the same entry is read as a rightful entry, switch off
# the is_temp bit
if (revids_d[entry[:4]], entry[:4], True) in new_ids:
idx = new_ids.index((revids_d[entry[:4]], entry[:4], True))
new_ids[idx] = (revids_d[entry[:4]], entry[:4], False)
else:
new_ids.append((revids_d[entry[:4]], entry[:4], False))
# If it is a "+" entry, add all the secondary representations
if len(entry) == 5 and entry[4] == "+":
if entry[:4] in secrep_d: # entry is absent if it does not have secondary reprs
new_secrep_d[entry[:4]] = secrep_d[entry[:4]]
else:
print_log((
'WARNING',
this_name,
("Entry {0} was included in sample_structures, yet "
"{1} does not have secondary representations")\
.format(entry, entry[:4])
))
elif entry[:4] in secrep:
repr_id = revsecrep_d[entry[:4]]
# Add the primary entry corresponding to the orphan secondary entry
if (repr_id, ids_d[repr_id], False) not in new_ids:
# This entry is only temporary and should not be considered as an EncoMPASS entry
new_ids.append((repr_id, ids_d[repr_id], True))
                if ids_d[repr_id] not in new_secrep_d:
new_secrep_d[ids_d[repr_id]] = []
new_secrep_d[ids_d[repr_id]].append(entry[:4])
else:
local_stats['sample_not_found'].append(entry)
local_stats['temp_primary'] = [x[1] for x in new_ids if x[2] is True]
ids = new_ids
secrep_d = new_secrep_d
local_stats['selected_primary'] = [x[1] for x in ids]
for x in secrep_d:
local_stats['selected_secondary'] += secrep_d[x]
# Write representation chart of all the entries considered in this EncoMPASS repository
write_OPM_representation_chart(ids, secrep_d, LOC_OPMreprchart)
# Show statistics
for x in sorted(local_stats):
print("STATS", "mainlist_from_OPM", x, len(local_stats[x]), local_stats[x])
print("STATS", "mainlist_from_OPM", "Finished", "time", time.ctime(), "\n")
# List of primary entries [(<int>ID, <str>PDBID, <bool>is_temp), ...] and dictionary <str>PDBID->[<str>PDBID, ...] linking each primary ID to list of secondary PDBIDs
return ids, secrep_d
def scan_OPM(options, locations, entries, str_data={}, use_pkl=False,
thr_log_status='ERROR'):
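    """Downloads or retrieves the OPM json and coordinate files for the given primary
    entries, fills the FROM_OPM section of each str_data entry (mirroring it onto the
    selected secondary representations), and finally removes the temporary proxy entries
    that were added only to resolve orphan secondary structures.
    """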
this_name = scan_OPM.__name__
# Options and locations aliases
OPT_dst = options['PATHS'][('dst', 'data_structure_template')]
OPT_from_repo = options['RUN'][('repo', 'from_repo')]
OPT_offline = options['RUN'][('ol', 'offline')]
OPT_debug = options['EXECUTION'][('debug', 'debug')]
OPT_sample_structures =\
options['EXECUTION'][('sample', 'sample_structures')]
LOC_OPM = locations['FSYSPATH']['OPM']
LOC_OPMjsons = locations['FSYSPATH']['OPMjsons']
LOC_OPMpdbs = locations['FSYSPATH']['OPMpdbs']
LOC_UniProt = locations['FSYS']['UniProt']
LOC_uniprot_all = locations['SYSFILES']['uniprot_all']
LOC_OPMreprchart = locations['SYSFILES']['OPMreprchart']
# Statistics
local_stats = {
'json_not_found' : [],
'no_segments' : [],
'no_alphabeta' : [],
'OPMpdb_not_found' : []
}
# Update repository
update_repo = check_update_repo(options, locations)
    # Download/retrieval of OPM info about primary entries
print("scan_OPM", "Download json files", "time", time.ctime())
primary_structures_data = {}
for i, pdbi, is_temp in sorted(entries, key=lambda x:x[1]):
if OPT_debug:
print(pdbi)
json_filename = LOC_OPMjsons + pdbi + '.json'
repofilename = OPT_from_repo + LOC_OPMjsons + pdbi + '.json'
if OPT_from_repo and os.path.exists(repofilename):
shutil.copyfile(repofilename, json_filename)
else:
if ((not OPT_offline) and
(update_repo or (not os.path.exists(json_filename)))):
res = requests.get(
f"https://lomize-group-opm.herokuapp.com/primary_structures/{i}",
params={},
headers={"Accept" : "application/json"}
)
time.sleep(0.1)
if not os.path.exists(json_filename):
with open(json_filename, 'w') as f:
f.write(json.dumps(res.json()))
if not os.path.exists(json_filename):
print_log((
'ERROR',
this_name,
f"Could not find file {json_filename}"
))
local_stats['json_not_found'].append(pdbi)
else:
with codecs.open(json_filename, 'r+', encoding='utf-8')\
as json_file:
                primary_structures_data[(i, pdbi, is_temp)] = json.load(json_file)
# Download OPM coordinate files
print("scan_OPM", "Download coordinate files", "time", time.ctime())
for i, pdbi, is_temp in sorted(primary_structures_data, key=lambda x:x[1]):
if options['EXECUTION'][('debug', 'debug')]:
print(pdbi)
if pdbi not in str_data:
str_data[pdbi] = define_str_data_entry(OPT_dst)
# Update status
str_data[pdbi]['status'].append('initial_opm')
# EncoMPASS name is set to OPM name
str_data[pdbi]['ENCOMPASS']['name'] =\
primary_structures_data[(i, pdbi, is_temp)]['name']
str_data[pdbi]['PASSPORT'].append(passport_entry(
this_name+'_encname_opm',
pdbi,
"The name of this structure follows the OPM nomenclature"
))
# Download/retrieve the OPM coordinate files
pdb_path = LOC_OPMpdbs + pdbi + '.pdb'
repofilename = OPT_from_repo + LOC_OPMpdbs + pdbi + '.pdb'
if OPT_from_repo and os.path.exists(repofilename):
shutil.copyfile(repofilename, pdb_path)
else:
if not os.path.exists(pdb_path) and not OPT_offline:
download_result = reiterated_simple_download(
f'https://opm-assets.storage.googleapis.com/pdb/{pdbi}.pdb',
pdb_path
)
# Remove 0-segment structures
if int(primary_structures_data[(i, pdbi, is_temp)]['subunit_segments']) == 0:
str_data[pdbi]['PASSPORT'].append(passport_entry(
this_name+'_zero_tmsegments',
pdbi,
("According to OPM, this structure has 0 TM segments, i.e. "
"it is not a transmembrane protein structure. The entry "
"will not be considered in EncoMPASS.")
))
str_data[pdbi]['status'].append('eliminated')
str_data[pdbi]['delete_keyword'] = 'Monotopic'
local_stats['no_segments'].append(pdbi)
# Sort attributes
for x in str_data[pdbi]['FROM_OPM']:
# Copy all superfamily sub-attributes
if 'superfamily' in x:
for y in str_data[pdbi]['FROM_OPM']['superfamily']:
str_data[pdbi]['FROM_OPM']['superfamily'][y] =\
primary_structures_data[(i, pdbi, is_temp)]['family']['superfamily'][y]
# Class renaming: alpha-beta-other
# TO DO add a statistic here
elif 'class' in x:
classtype = primary_structures_data[(i, pdbi, is_temp)]['family']\
['superfamily']['classtype']['name']
str_data[pdbi]['FROM_OPM']['classtype'] =\
primary_structures_data[(i, pdbi, is_temp)]['family']\
['superfamily']['classtype']['name']
if classtype == 'Alpha-helical polytopic':
enc_class = 'alpha'
elif classtype == 'Beta-barrel transmembrane':
enc_class = 'beta'
else:
local_stats['no_alphabeta'].append(pdbi)
enc_class = 'other'
str_data[pdbi]['ENCOMPASS']['class'] = enc_class
str_data[pdbi]['PASSPORT'].append(passport_entry(
this_name+'_encclass_opm',
pdbi,
("This structure is classified according to the OPM "
"topology classification, in the category: {0}"
).format(enc_class)
))
# Check whether the OPM coordinate file is there
elif x == 'coordinate_file':
if os.path.exists(pdb_path):
str_data[pdbi]['FROM_OPM']['coordinate_file'] = pdb_path
else:
local_stats['OPMpdb_not_found'].append(pdbi)
print_log((
'ERROR',
this_name,
"OPM pdb file {0} was not found in {1}".format(
pdbi, locations['FSYSPATH']['OPMpdbs'])
))
if ((not str_data[pdbi]['status']) or
str_data[pdbi]['status'][-1] != 'opm_eliminated'):
str_data[pdbi]['status'].append('opm_eliminated')
if ((not str_data[pdbi]['status']) or
str_data[pdbi]['status'][-1] != 'run_ppm'):
str_data[pdbi]['status'].append('run_ppm')
# Copy other dictionaries
elif (isinstance(str_data[pdbi]['FROM_OPM'][x], FixedDict) and
x != 'analysis'):
for y in str_data[pdbi]['FROM_OPM'][x]:
str_data[pdbi]['FROM_OPM'][x][y] =\
primary_structures_data[(i, pdbi, is_temp)][x][y]
# Copy other lists
elif isinstance(str_data[pdbi]['FROM_OPM'][x], FixedList):
                # If there are no secondary representations, cycle
if (x == 'secondary_representations' and
(not primary_structures_data[(i, pdbi, is_temp)][x])):
continue
                # Chains as described in OPM
                # The 'subunits' key in the OPM json contains the info regarding TM segments
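                # Illustrative OPM "segment" string (hypothetical values), parsed below into
                # (start, end) residue tuples:
                #   "3 segments: 1( 12- 35), 2( 48- 70), 3( 85- 107)"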
if x == 'subunits':
for subunit in primary_structures_data[(i, pdbi, is_temp)][x]:
subunits_key = subunit['protein_letter']
str_data[pdbi]['FROM_OPM']['ksubunits'].append(
subunits_key)
tempd = FixedDict(
str_data[pdbi]['FROM_OPM']['subunits']\
.get_fdict_template()
)
for y in tempd:
# Fill OPM segments
if y == "segment":
tempd[y] = []
for seg in subunit[y].split('(')[1:]:
n = seg.split('-')
if len(n) == 2:
n1, n2 = seg.split('-')
n1 = int(''.join(n1.split()))
n2 = int(''.join(
n2.split(')')[0].split())
)
elif len(n) == 1:
n1, n2 = n[0][:n[0].index(')')].split()
tempd[y].append(tuple([n1, n2]))
#print("OPM SEGMENTS", pdbi, subunits_key, tempd[y])
else:
tempd[y] = subunit[y]
str_data[pdbi]['FROM_OPM']['subunits']\
.append(tempd)
#print("WRITE", tempd.show_dictionary())
# Otherwise, OPM json and the str_data have the same keys and data can be copied
else:
for il, l in enumerate(primary_structures_data[(i, pdbi, is_temp)][x]):
tempd = FixedDict(
str_data[pdbi]['FROM_OPM'][x].get_fdict_template())
for y in tempd:
tempd[y] = primary_structures_data[(i, pdbi, is_temp)][x][il][y]
str_data[pdbi]['FROM_OPM'][x].append(tempd)
# Copy other non-dictionary, non-list data
else:
if x in primary_structures_data[(i, pdbi, is_temp)]:
str_data[pdbi]['FROM_OPM'][x] =\
primary_structures_data[(i, pdbi, is_temp)][x]
# Check the primary representation case
str_data[pdbi]['FROM_OPM']['primary_representation'] = True
# Secondary representations
if len(str_data[pdbi]['FROM_OPM']['secondary_representations']) > 0:
# Loop on the codes found in the OPM json page
sec_pdbis = [x['pdbid'] for x in\
primary_structures_data[(i, pdbi, is_temp)]['secondary_representations']]
for sec_pdbi in sec_pdbis:
# If they are not among the codes to sample, cycle
if (OPT_sample_structures and
sec_pdbi not in OPT_sample_structures):
continue
# Retrieve the OPM coordinate file
sec_pdb_path = LOC_OPMpdbs + sec_pdbi + '.pdb'
repofilename = OPT_from_repo + LOC_OPMpdbs + sec_pdbi + '.pdb'
if OPT_from_repo and os.path.exists(repofilename):
shutil.copyfile(repofilename, sec_pdb_path)
else:
if not OPT_offline and not os.path.exists(sec_pdb_path):
download_result = reiterated_simple_download(
f'https://opm-assets.storage.googleapis.com/pdb/{sec_pdbi}.pdb',
sec_pdb_path
)
# Copy the str_data entry of the pdbi entry into sec_pdbi -> DANGEROUS: CHECK ALL PATHS
str_data[sec_pdbi] = FixedDict(str_data[pdbi])
if str_data[pdbi]['ENCOMPASS']['class'] == 'other':
local_stats['no_alphabeta'].append(sec_pdbi)
# OPM coordinate file analysis
if os.path.exists(sec_pdb_path):
str_data[sec_pdbi]['FROM_OPM']\
['coordinate_file'] = sec_pdb_path
str_data[sec_pdbi]['FROM_OPM']\
['primary_representation'] = False
# Add as secondary representations all other secondary
# representations of its primary representation except
# itself, plus the primary representation
str_data[sec_pdbi]['FROM_OPM']['secondary_representations'] =\
str_data[sec_pdbi]['FROM_OPM']\
.show_dictionary(quiet=True)['secondary_representations']
for iixx, xx in enumerate(str_data[sec_pdbi]['FROM_OPM']\
['secondary_representations']):
if xx['pdbid'] == sec_pdbi:
iitodel = iixx
break
del(str_data[sec_pdbi]['FROM_OPM']\
['secondary_representations'][iitodel])
elm = str_data[sec_pdbi]['FROM_OPM']\
['secondary_representations'].get_fdict_template()
for ii, dd, tt in entries:
if dd == pdbi:
elm['id'] = ii
break
elm['pdbid'] = pdbi
str_data[sec_pdbi]['FROM_OPM']\
['secondary_representations'].append(FixedDict(elm))
else:
local_stats['OPMpdb_not_found'].append(sec_pdbi)
print_log((
'ERROR',
this_name,
"OPM coordinate file {0} was not found in {1}"\
.format(sec_pdbi, LOC_OPMpdbs)
))
if ((not str_data[sec_pdbi]['status']) or
str_data[sec_pdbi]['status'][-1] != 'opm_eliminated'):
str_data[sec_pdbi]['status'].append('opm_eliminated')
if ((not str_data[sec_pdbi]['status']) or
str_data[sec_pdbi]['status'][-1] != 'run_ppm'):
str_data[sec_pdbi]['status'].append('run_ppm')
# If it does not have subunits, put in run_ppm list
if len(str_data[pdbi]['FROM_OPM']['subunits']) == 0:
str_data[pdbi]['status'].append('run_ppm')
# Remove primary entries that were added only in order to describe some orphan secondary entry
if is_temp:
del(str_data[pdbi])
print_log((
'NOTICE',
this_name,
"Entry {0} was only a proxy, it will be deleted".format(pdbi)
))
# If from_repo, copy uniprot data from repository
if OPT_from_repo:
repofilename = OPT_from_repo + LOC_UniProt + os.path.basename(LOC_uniprot_all)
print(repofilename, LOC_uniprot_all)
shutil.copyfile(repofilename, LOC_uniprot_all)
# Make pickle
str_data_pkl = LOC_OPM + '.str_data_after_OPM_read.pkl'
pickle.dump(str_data, open(str_data_pkl, 'wb'))
# Record current date
check_update_repo(options, locations, record=True)
# Show statistics
for x in sorted(local_stats):
print("STATS", "scan_OPM", x, len(local_stats[x]), local_stats[x])
print("STATS", "scan_OPM", "Finished", "time", time.ctime(), "\n")
return str_data
def scan_PDB(options, locations, str_data={}, pdb_list=[], use_pkl=False, thr_log_status='ERROR'):
"""scan_PDB
Collects information from the RCSB GraphQL API and records them in the data
structure (FROM_PDB key)
"""
this_name = scan_PDB.__name__
# Options and locations aliases
local_stats = {
'no_record' : [],
'no_str' : [],
'no_title' : [],
'no_expmethod' : [],
'no_resolution' : [],
'no_depo_date' : [],
'no_entities' : []
}
if not pdb_list:
pdb_list = sorted([x for x in str_data])
# Pickle
str_data_pkl = locations['FSYSPATH']['PDB'] + '.str_data_after_PDB_read.pkl'
if use_pkl and os.path.exists(str_data_pkl):
str_data = pickle.load(open(str_data_pkl, 'rb'))
# Obsolete PDB entries
obsolete = set()
obsolete_filename = locations['FSYSPATH']['PDBjsons'] + 'obsolete_entries.json'
# Get them from repo or server
repofilename = options['RUN'][('repo', 'from_repo')] + locations['FSYS']['PDBjsons'] + 'obsolete_entries.json'
if options['RUN'][('repo', 'from_repo')] and os.path.exists(repofilename):
shutil.copyfile(repofilename, obsolete_filename)
else:
pass
# Json info records
temparch = {} # Keys 'coordinate_file' and 'header' do not come from json but from structure
# pdbis -> batches
batch_size = 100
batches = []
with open(locations['FSYSPATH']['cache'] + 'PDBall.txt', 'w') as f:
for ipdbi, pdbi in enumerate(pdb_list):
if ipdbi%batch_size == 0:
batches.append([])
batches[-1].append(pdbi.upper())
f.write(pdbi.upper()+'\n')
# query by batches
url = 'https://data.rcsb.org/graphql?'
headers = []
print("scan_PDB", "Querying PDB by batches", "time", time.ctime())
for ibatch, batch in enumerate(batches):
if options['EXECUTION'][('debug', 'debug')]:
print("Batch", ibatch)
# Batch query
json_query = pdb_query(path=locations['SYSFILES']['pdbquery'])
json_query = json_query.replace("<PDBI>", ", ".join(['"'+x+'"' for x in batch]))
if options['EXECUTION'][('debug', 'debug')]:
print("JSON query file", json_query)
r = requests.post(url, json={'query': json_query})
js = json.loads(r.text)
if options['EXECUTION'][('debug', 'debug')]:
print("JSON data", js)
ebatch = []
for e in js['data']['entries']:
ebatch.append(e['rcsb_entry_container_identifiers']['entry_id'].lower())
# Download and parse individual structures
for pdbi in batch:
if options['EXECUTION'][('debug', 'debug')]:
print(pdbi)
if pdbi.lower() not in ebatch:
local_stats['no_record'].append(pdbi.lower())
continue
ipdbi = ebatch.index(pdbi.lower())
with open(locations['FSYSPATH']['PDBjsons'] + pdbi.lower() + '.json', 'w') as jf:
json.dump(js['data']['entries'][ipdbi], jf)
temparch[pdbi.lower()] = js['data']['entries'][ipdbi]
pdb_filename = locations['FSYSPATH']['PDBpdbs'] + pdbi.lower() + '.pdb'
mmcif_filename = locations['FSYSPATH']['PDBmmcifs'] + pdbi.lower() + '.cif'
# Download coordinate file
if options['RUN'][('mmcifmirror', 'mmcif_local_mirror')]:
mirrormmcif_fn = options['RUN']['mmcifmirror', 'mmcif_local_mirror'] + pdbi.upper() + '.cif'
mirrormmcif_fn2 = options['RUN']['mmcifmirror', 'mmcif_local_mirror'] + pdbi.lower() + '.cif'
if os.path.exists(mirrormmcif_fn):
shutil.copyfile(mirrormmcif_fn, mmcif_filename)
elif os.path.exists(mirrormmcif_fn2):
shutil.copyfile(mirrormmcif_fn2, mmcif_filename)
else:
str_data[pdbi]['PASSPORT'].append(passport_entry(this_name+'_no_pdbmirror', pdbi, "This entry was not found in the pdb mirror"))
elif options['RUN'][('pdbmirror', 'pdb_local_mirror')]:
mirrorpdb_fn = options['RUN']['pdbmirror', 'pdb_local_mirror'] + '/' + pdbi.lower()[1:3] + '/pdb' + pdbi.lower() + '.ent.gz'
if os.path.exists(mirrorpdb_fn):
shutil.copyfile(mirrorpdb_fn, pdb_filename+'.gz')
os.system('gunzip ' + pdb_filename + '.gz')
elif options['RUN'][('repo', 'from_repo')]:
repofilename = options['RUN'][('repo', 'from_repo')] + locations['FSYS']['PDBpdbs'] + pdbi.lower() + '.pdb'
if os.path.exists(repofilename):
shutil.copyfile(repofilename, pdb_filename)
else:
download_result = reiterated_simple_download('https://files.rcsb.org/download/{0}.cif'.format(pdbi.upper()), mmcif_filename)
download_result_2 = reiterated_simple_download('https://files.rcsb.org/download/{0}.pdb'.format(pdbi.upper()), pdb_filename)
# Parse header
if os.path.exists(mmcif_filename) or os.path.exists(mmcif_filename.replace(".gz", "")+"gz"):
temparch[pdbi.lower()]['coordinate_file'] = mmcif_filename
header, hreport = parse_mmCIF_header_wrap(mmcif_filename)
temparch[pdbi.lower()]['header'] = header
headers.append(temparch[pdbi.lower()]['header']) # USED NOW! For deciding alpha-beta-other class
elif os.path.exists(pdb_filename) or os.path.exists(pdb_filename.replace(".gz", "")+"gz"):
temparch[pdbi.lower()]['coordinate_file'] = pdb_filename
temparch[pdbi.lower()]['header'], hreport = parse_PDB_header(pdb_filename) # not used so far
else:
local_stats['no_str'].append(pdbi.lower())
mmCIF_header_stats(locations['FSYSPATH']['PDBstats'], headers)
# Fill FROM_PDB with info collected in temparch and with those in the PDB header
print("scan_PDB", "Fill data structure", "time", time.ctime())
for pdbi in pdb_list:
if options['EXECUTION'][('debug', 'debug')]:
print(pdbi)
if pdbi in local_stats['no_record']:
str_data[pdbi]['PASSPORT'].append(passport_entry(this_name+'_no_graphql', pdbi, "The GraphQL entry for {0} was not found. The PDB entry will be disregarded".format(pdbi)))
str_data[pdbi]['status'].append('pdb_eliminated')
continue
if pdbi in local_stats['no_str']:
str_data[pdbi]['PASSPORT'].append(passport_entry(this_name+'_no_coordfile', pdbi, "The PDB coordinate file for {0} was not found. The PDB entry will be disregarded".format(pdbi)))
str_data[pdbi]['status'].append('pdb_eliminated')
continue
from_pdb = FixedDict(str_data[pdbi]['FROM_PDB'].get_fdict_template()) # Template FROM_PDB structure
from_pdb['coordinate_file'] = temparch[pdbi.lower()]['coordinate_file']
if not from_pdb['coordinate_file']:
str_data[pdbi]['status'].append('pdb_eliminated')
from_pdb['ss_class'] = temparch[pdbi]['header']['ss_class']
if not str_data[pdbi]['status'] or str_data[pdbi]['status'][-1] != 'initial_pdb':
str_data[pdbi]['status'].append('initial_pdb')
# General info
from_pdb['pdb_id'] = pdbi
if temparch[pdbi]['struct'] is not None:
from_pdb['name'] = temparch[pdbi]['struct']['title']
else:
wnores_msg = ('WARNING', this_name, f"PDB json entry {pdbi} does not have the 'struct' key. Name will be set to None")
print_log(wnores_msg)
from_pdb['name'] = None
local_stats['no_title'].append(pdbi)
        if temparch[pdbi]['exptl'] is not None:
            from_pdb['experimental_method'] = ",".join(x['method'] for x in temparch[pdbi]['exptl'])
        else:
            wnores_msg = ('WARNING', this_name, f"PDB json entry {pdbi} does not have the 'exptl' key. Experimental method will be set to None")
            print_log(wnores_msg)
            from_pdb['experimental_method'] = None
            local_stats['no_expmethod'].append(pdbi)
if temparch[pdbi]['refine']: # It is a list and can be empty
# If no 'refine', there are 3 keywords missing, not just 'PDB_resolution'
from_pdb['resolution'] = temparch[pdbi]['refine'][0]['ls_d_res_high']
else:
if temparch[pdbi]['header']['resolution']:
from_pdb['resolution'] = temparch[pdbi]['header']['resolution']
else:
local_stats['no_resolution'].append(pdbi)
wnores_msg = ('WARNING', this_name, f"PDB json entry {pdbi} does not have the 'refine' key. Resolution will be set to None")
print_log(wnores_msg)
from_pdb['resolution'] = None
if temparch[pdbi]['pdbx_database_status'] is not None:
print(temparch[pdbi]['pdbx_database_status'])
from_pdb['deposition_date'] = temparch[pdbi]['pdbx_database_status']['recvd_initial_deposition_date']
else:
local_stats['no_depo_date'].append(pdbi)
            wnores_msg = ('WARNING', this_name, f"PDB json entry {pdbi} does not have the 'pdbx_database_status' key. Deposition date will be set to None")
print_log(wnores_msg)
from_pdb['deposition_date'] = None
# Polymers
new2oldch = {}
polychains = set()
alreadythere = set()
if temparch[pdbi]['polymer_entities'] is not None:
for ip, p in enumerate(temparch[pdbi]['polymer_entities']):
chains = temparch[pdbi]['polymer_entities'][ip]['entity_poly']['pdbx_strand_id']
for inst in p['polymer_entity_instances']:
if inst['rcsb_polymer_entity_instance_container_identifiers']['auth_asym_id'] in polychains: # HETATMs of the same chain are now considered a separate chain, we don't want to have them
continue
if inst['rcsb_polymer_entity_instance_container_identifiers']['auth_asym_id'] in chains:
new2oldch[inst['rcsb_polymer_entity_instance_container_identifiers']['asym_id']] = inst['rcsb_polymer_entity_instance_container_identifiers']['auth_asym_id']
polychains.add(inst['rcsb_polymer_entity_instance_container_identifiers']['auth_asym_id'])
else:
exit(1)
for ch in sorted(list(polychains)):
poly = FixedDict(from_pdb['polymers'].get_fdict_template())
poly['chain'] = ch
poly['sequence'] = temparch[pdbi]['polymer_entities'][ip]['entity_poly']['pdbx_seq_one_letter_code_can']
if type(temparch[pdbi]['polymer_entities'][ip]['rcsb_polymer_entity_container_identifiers']['uniprot_ids']) == list:
poly['uniprot_acc'] = ",".join(temparch[pdbi]['polymer_entities'][ip]['rcsb_polymer_entity_container_identifiers']['uniprot_ids'])
if type(temparch[pdbi]['polymer_entities'][ip]['rcsb_entity_source_organism']) == list:
poly['taxonomy'] = ",".join([x['ncbi_scientific_name'] if x['ncbi_scientific_name'] else "" for x in temparch[pdbi]['polymer_entities'][ip]['rcsb_entity_source_organism']])
from_pdb['polymers'].append(poly)
else:
local_stats['no_entities'].append(pdbi)
wnores_msg = ('WARNING', this_name, f"PDB json entry {pdbi} does not have the 'polymer_entities' key. Polymer section will be empty")
print_log(wnores_msg)
# Biological assembly
if temparch[pdbi]['assemblies'] is not None:
# Assemblies
# NB: an assembly is a way to form a complex. Different assemblies are found when different hypotheses on the
# biological assembly have been formulated. Usually, we consider the first assembly (but we record all)
oper_tr = FixedDict(from_pdb['translrots'].get_fdict_template())
for ib, b in enumerate(temparch[pdbi]['assemblies']):
ba = FixedDict(from_pdb['biological_assemblies'].get_fdict_template())
ba['method'] = temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly']['method_details']
ba['description'] = temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly']['details']
# Transformations
                # NB: a transformation is a SET of matrices that applies to the SAME SET of chains. An assembly can have more than one transformation
                # (such as 3dh4), because different matrices can apply to different chains and still combine to give the same complex
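                # Illustrative oper_expression values (hypothetical examples): "1", "1,2,3",
                # or "(1-3)(4)"; read_oper_expression expands them, via oper_keys, into lists
                # of operator-key sequences.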
for itr in range(len(temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly_gen'])):
tr = FixedDict(ba['transformations'].get_fdict_template())
tr['chains'] = [new2oldch[x] for x in temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly_gen'][itr]['asym_id_list'] if x in new2oldch] # List of chains(new name) for each assembly matrix)
oper_list, oper_keys = read_transf_matrices_json(temparch[pdbi]['assemblies'][ib]['pdbx_struct_oper_list'])
opexps = read_oper_expression(temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly_gen'][itr]['oper_expression'], oper_keys)
#print("READ_OPER_EXPRESSION", pdbi, ib, itr, temparch[pdbi]['assemblies'][ib]['pdbx_struct_assembly_gen'][itr]['oper_expression'], oper_keys, opexps)
# Rototranslations are individual pairs of rotation matrix and translation vectors
# They are the building blocks of operators
# NB: first the matrix will be applied, then the vector.
# NB2: in json files, translrots are listed as depending on each operator, but in mmCIF files they are not
                    # (i.e. their id is UNIQUE throughout the file; it does not reset at each operator change)
# Here, we keep the more synthetic (and correct?) mmCIF notation
for ioper, oper in enumerate(oper_list):
op = FixedDict(from_pdb['translrots'].get_fdict_template())
op['matrix'] = oper['matrix']
op['vector'] = oper['vector']
# Append ONLY if ID is not already there
if oper_keys[ioper] not in from_pdb['ktranslrots']:
from_pdb['ktranslrots'].append(oper_keys[ioper])
from_pdb['translrots'].append(op)
# Operators are individual rototranslation functions. They can combine multiple rototranslators
# Example: [(M1,v1),(M2,v2)] is the function v -> (M2*((M1*v)+v1))+v2
                    # NB: there always exist M*,v* such that [(M1,v1),...,(Mn,vn)] = [(M*,v*)]; for two steps, M* = M2*M1 and v* = M2*v1 + v2 (compose inductively for longer chains)
tr['operators'] = opexps
ba['transformations'].append(tr)
from_pdb['biological_assemblies'].append(ba)
else:
ba = FixedDict(from_pdb['biological_assemblies'].get_fdict_template())
tr = FixedDict(ba['transformations'].get_fdict_template())
tr['chains'] = sorted(list(polychains))
tr['operators'] = [["1"]]
ba['transformations'].append(tr)
from_pdb['biological_assemblies'].append(ba)
op = FixedDict(from_pdb['translrots'].get_fdict_template())
op['matrix'] = [[1,0,0],[0,1,0],[0,0,1]]
op['vector'] = [0, 0, 0]
from_pdb['translrots'].append(op)
from_pdb['ktranslrots'].append("1")
str_data[pdbi]['FROM_PDB'] = from_pdb
for x in sorted(local_stats):
print("STATS", "scan_PDB", x, len(local_stats[x]), local_stats[x])
pickle.dump(str_data, open(str_data_pkl, 'wb'))
print("STATS", "scan_OPM", "Finished", "time", time.ctime(), "\n")
return str_data
def check_version(options, tvec, str_data_pdbi, pdbi, tm_chains=[], thr_log_status="ERROR"):
"""Checks that
"""
this_name = check_version.__name__
pdb_dict, report, prov = tvec
if options['EXECUTION'][('debug', 'debug')]:
print("check_version", pdbi, prov, "time", time.ctime())
hole_thr = options['PARAMETERS'][('', 'hole_thr')]
hole_frac_thr = options['PARAMETERS'][('', 'hole_frac_thr')]
disordered_thr = options['PARAMETERS'][('', 'disordered_thr')]
inter_persub_clashes_thr = options['PARAMETERS'][('', 'inter_persub_clashes_thr')]
intra_persub_clashes_thr = options['PARAMETERS'][('', 'intra_persub_clashes_thr')]
elim_tag, no_biopy, approval = prov + '_eliminated', prov + '_no_biopy', prov + '_approved'
if not pdb_dict or ('COORDS' in pdb_dict and not pdb_dict['COORDS']):
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_no_coords', pdbi, "The coordinate file from the {0} database was not found".format(elim_tag[:3].upper())))
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
return str_data_pdbi
# Check consistency with PDB format (if consensus_signature_PDB option is there)
if options['PARAMETERS'][('', 'consensus_signature_PDB')] and not report["pdb_format"]:
        str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_no_pdbfmt', pdbi, "The coordinate file from the {0} database cannot be translated into PDB format (common causes: >100000 atoms or multiple-character chain names). The coordinate file will not be considered in EncoMPASS".format(prov.upper())))
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
# Chain-wise checks
for chain in sorted([x for x in pdb_dict['COORDS']]):
# Check for holes
if len(report['holes'][chain]) > hole_thr:
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_chain_hole', pdbi, "Subunit {0} of the structure as recorded in the coordinate file from the {1} database contains at least one undetermined segment longer than {2} amino acids. The coordinate file is labeled as problematic.".format(chain, prov.upper(), hole_thr)))
if (not tm_chains) or (chain in tm_chains):
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
# Check for holes (percent)
if report['holes_frac'][chain] > hole_frac_thr:
#print(pdbi, chain, report['holes'][chain])
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_chain_holefrc', pdbi, "Subunit {0} of the structure as recorded in the coordinate file from the {1} database contains undetermined regions amounting to more than {2}% of its sequence length. The coordinate file is labeled as problematic.".format(chain, prov.upper(), hole_frac_thr*100)))
if (not tm_chains) or (chain in tm_chains):
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
# Check for disordered regions
if len(report['disordered'][chain]) > disordered_thr:
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_chain_disres', pdbi, 'Subunit {0} of the structure as recorded in the coordinate file from the {1} database contains disordered or repeated residue indexes. The coordinate file is labeled as problematic.'.format(chain, prov.upper())))
if (not tm_chains) or (chain in tm_chains):
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
# Check for unknown residues (UNK or other strange types, which were included as UNK)
if report['UNK'][chain]:
            if options['PARAMETERS'][('', 'with_UNK')]:
                neg = ''
            else:
                neg = ' not'
            str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_UNK_resnames', pdbi, 'Subunit {0} of the structure as recorded in the coordinate file from the {1} database contains unknown residues (labeled as UNK in the original coordinate file). In compliance with the EncoMPASS guidelines, these residues will{2} be taken into account in the model'.format(chain, prov.upper(), neg)))
# Check for chain-wise clashes
interctot = 0
for chain2 in sorted([x for x in pdb_dict['COORDS']]):
if chain == chain2:
continue
interctot += len(report['clashes'][(chain, chain2)])
if interctot > inter_persub_clashes_thr or len(report['clashes'][(chain, chain2)]) > intra_persub_clashes_thr:
if interctot > inter_persub_clashes_thr:
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_interclash', pdbi, 'Subunit {0} of the structure as recorded in the coordinate file from the {1} database has too many steric clashes with the other subunits ({2} > {3})'.format(chain, prov.upper(), interctot, inter_persub_clashes_thr)))
else:
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_interclash', pdbi, 'Subunit {0} of the structure as recorded in the coordinate file from the {1} database contains too many steric clashes ({2} > {3})'.format(chain, prov.upper(), len(report['clashes'][(chain, chain2)]), intra_persub_clashes_thr)))
if not str_data_pdbi['status'] or str_data_pdbi['status'][-1] != elim_tag:
str_data_pdbi['status'].append(elim_tag)
if elim_tag not in str_data_pdbi['status'] and no_biopy not in str_data_pdbi['status']:
str_data_pdbi['status'].append(approval)
# Check inter-chain errors
if report['merged_chains']:
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_conch', pdbi, 'The structure as recorded in the coordinate file from the {0} database contains concatenating chains. Subunit names have been modified: {1}'.format(prov.upper(), " ".join([",".join([x for x in m[1:]])+" -> "+m[0] for m in report['merged_chains']]))))
# Check MSE->MET replacement
if report['mse_res']:
rep_str = ", ".join("chain {0} resID {1}{2}".format(ch, resid[0], resid[1]) for resid, ch in report['mse_res'])
str_data_pdbi['PASSPORT'].append(passport_entry(this_name+'_mse2met', pdbi, 'The following MSE residues were replaced by MET residues: ' + rep_str))
return str_data_pdbi
def check_whole_structures(data):
"""
Decides whether to use the OPM (preferred) or PDB structure, or to condemn the entry
"""
this_name = check_whole_structures.__name__
# Decompress input
options, locations, str_data, in_parallel, thr_log_status = data # in_parallel is a boolean that tells if this calculation is run in parallel
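    # Illustrative call (sketch; the chunk name is hypothetical):
    #   check_whole_structures((options, locations, str_data_chunk, True, 'ERROR'))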
# Options and locations aliases
LOC_pad = locations['SYSFILES']['pdbatomdict']
tic = time.time()
new_OPM_data = {}
for pdbi in sorted([k for k in str_data]):
# Parses PDB as it is downloaded from the PDB
# with scan == True: clash control, hole control
# Filter
if 'eliminated' in str_data[pdbi]['status']:
continue
str_data[pdbi]['FROM_PDB']['cache_coordinate_file'] = locations['FSYSPATH']['TMPcoords'] + pdbi+'_PDB.cif'
str_data[pdbi]['FROM_OPM']['cache_coordinate_file'] = locations['FSYSPATH']['TMPcoords'] + pdbi+'_OPM.cif'
PDB_dict, PDB_report = parse_mmCIF_coords_wrap(str_data[pdbi]['FROM_PDB']['coordinate_file'], scan=True, options=options)
if PDB_dict and 'COORDS' in PDB_dict and PDB_dict['COORDS']:
write_mmCIF(str_data[pdbi], PDB_dict, str_data[pdbi]['FROM_PDB']['cache_coordinate_file'], LOC_pad)
OPMpdb_dict, OPM_report = parse_PDB_coords(str_data[pdbi]['FROM_OPM']['coordinate_file'], scan=True, options=options) # Can be null!
if OPMpdb_dict and 'COORDS' in OPMpdb_dict and OPMpdb_dict['COORDS']:
write_mmCIF(str_data[pdbi], OPMpdb_dict, str_data[pdbi]['FROM_OPM']['cache_coordinate_file'], LOC_pad)
# Checks on OPM and PDB structure (trust OPM more than PDB)
for tvec in [(OPMpdb_dict, OPM_report, 'opm'), (PDB_dict, PDB_report, 'pdb')]:
tmch = str_data[pdbi]['FROM_OPM']['ksubunits'] if tvec[2] == 'opm' else []
str_data[pdbi] = check_version(options, tvec, str_data[pdbi], pdbi, tm_chains=tmch)
# Check OPM-PDB elimination from previous checks
if 'pdb_eliminated' in str_data[pdbi]['status'] and 'opm_eliminated' in str_data[pdbi]['status']:
if not str_data[pdbi]['status'] or str_data[pdbi]['status'][-1] != 'eliminated':
str_data[pdbi]['status'].append('eliminated')
str_data[pdbi]['delete_keyword'] = 'Unclear'
str_data[pdbi]['PASSPORT'].append(passport_entry(this_name+'_nocoord', pdbi, 'There are no reliable coordinate files for this structure. The entry is scheduled for deletion.'))
pdbswitch = False # Flag for changing the reference to PDB. True when OPM was deemed incorrect or dangerous in the following post-processing
# In case OPM is not eliminated
if 'eliminated' not in str_data[pdbi]['status'] and 'opm_eliminated' not in str_data[pdbi]['status']:
# Decides whether to switch to PDB or to keep the OPM structure
pdbswitch = False
for sub_ch in str_data[pdbi]['FROM_OPM']['ksubunits']:
if not OPMpdb_dict or sub_ch not in OPMpdb_dict['COORDS']: