forked from shendurelab/LACHESIS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathReporter.cc
1324 lines (968 loc) · 57.3 KB
/
Reporter.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
///////////////////////////////////////////////////////////////////////////////
// //
// This software and its documentation are copyright (c) 2014-2015 by Joshua //
// N. Burton and the University of Washington. All rights are reserved. //
// //
// THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS //
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF //
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. //
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY //
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT //
// OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR //
// THE USE OR OTHER DEALINGS IN THE SOFTWARE. //
// //
///////////////////////////////////////////////////////////////////////////////
// For documentation, see Reporter.h
#include "Reporter.h"
#include "RunParams.h"
#include "TrueMapping.h"
#include "ClusterVec.h"
#include "ContigOrdering.h"
#include "GenomeLinkMatrix.h" // for MakeWholeAssemblyHeatmap only
#include "TextFileParsers.h" // ParseTabDelimFile()
// C++ includes
#include <assert.h>
#include <math.h> // isfinite
#include <vector>
#include <iostream>
#include <fstream>
#include <iomanip> // setprecision
#include <algorithm> // sort, max_element
#include <numeric> // accumulate
// Local includes
#include "TimeMem.h"
#include "gtools/N50.h"
#include "gtools/SAMStepper.h" // NTargetsInSAM(), TargetLengths()
// Boost libraries
#include <boost/dynamic_bitset.hpp>
#include <boost/lexical_cast.hpp>
// This is a local variable for dummy use. (It makes this module non-thread-safe, but who cares.)
static char _LINE[10000];
// AND(): Calculate the logical AND of two dynamic_bitsets.
boost::dynamic_bitset<>
AND( const boost::dynamic_bitset<> a, const boost::dynamic_bitset<> b )
{
assert ( a.size() == b.size() );
boost::dynamic_bitset<> c( a.size(), false );
for ( size_t i = 0; i < a.size(); i++ )
if ( a[i] && b[i] )
c[i] = true;
return c;
}
// SetToVec: Convert a set<int> into a vector<int> for easy random access.
vector<int>
SetToVec( const set<int> & s )
{
vector<int> v;
for ( set<int>::const_iterator it = s.begin(); it != s.end(); ++it )
v.push_back( *it );
return v;
}
// Calculate the Pearson correlation coefficient, r, indicating correlation between two data sets. Ranges from -1 to 1.
double
PearsonCorrelation( const vector<double> & x, const vector<double> & y )
{
int N = x.size();
assert( x.size() == y.size() );
// Calculate the means of x and y. Also find NaN's/inf's and flag them to be ignored.
double x_mean = 0, y_mean = 0;
vector<bool> ignore( N, false );
int N_ignore = 0;
for ( int i = 0; i < N; i++ ) {
if ( !isfinite( x[i] ) || !isfinite( y[i] ) ) {
ignore[i] = true;
N_ignore++;
}
else {
x_mean += x[i];
y_mean += y[i];
}
}
x_mean /= ( N - N_ignore );
y_mean /= ( N - N_ignore );
// Calculate the variances of x and y, and the covariance between them.
double covar = 0;
double x_var = 0, y_var = 0;
for ( int i = 0; i < N; i++ ) {
if ( ignore[i] ) continue;
double x_diff = x[i] - x_mean;
double y_diff = y[i] - y_mean;
covar += x_diff * y_diff;
x_var += x_diff * x_diff;
y_var += y_diff * y_diff;
}
return covar / sqrt( x_var * y_var );
}
// Constructor! Load in all the data, and do a bunch of sanity checks. trunks and orders can both be empty, or they can both be full.
Reporter::Reporter( const RunParams & run_params,
const ClusterVec & clusters,
const vector<ContigOrdering> & trunks,
const vector<ContigOrdering> & orders )
: _run_params( run_params ),
_true_mapping( run_params.LoadTrueMapping() ),
_N_contigs ( NTargetsInSAM( run_params._SAM_files[0] ) ),
_N_chroms ( _true_mapping ? _true_mapping->NTargets() : -1 ),
_N_clusters ( clusters.size() ),
_N_orderings( orders.size() ),
_contig_lengths( TargetLengths( run_params._SAM_files[0] ) ),
_total_contig_length( accumulate( _contig_lengths.begin(), _contig_lengths.end(), int64_t(0) ) ),
_clusters( clusters ),
_trunks( trunks ),
_orders( orders ),
_data( new ReporterData() )
{
cout << Time() << ": Launching a Reporter for a result with N_contigs = " << _N_contigs << ", N_chroms = " << _N_chroms << ", N_clusters = " << _N_clusters << ", N orderings = " << _N_orderings << ", total assembly length = " << _total_contig_length << endl;
/* SANITY CHECKS
*
* If any of these sanity checks fails, the data structures aren't from the same run.
*
*/
// The SAM file should be internally consistent.
assert( _N_contigs == (int) _contig_lengths.size() );
// Contig clusters must not contain more than the total number of contigs.
assert( (int) clusters.SizeSum() <= _N_contigs );
// The clusters must be non-empty and must contain contig IDs in the appropriate range
int N_contigs_in_clusters = 0;
for ( size_t i = 0; i < clusters.size(); i++ ) {
assert( !clusters[i].empty() );
assert( *clusters[i].rbegin() < _N_contigs );
N_contigs_in_clusters += clusters[i].size();
}
// If ordering has been run, there must be orderings available. (If there are no orderings, no ordering evaluation will be done.)
if ( run_params._do_ordering ) {
// There may not be as many orderings as there are clusters; all orderings may not have been calculated. But there can't be more.
assert( _N_orderings > 0 );
assert( _N_orderings <= _N_clusters );
// The trunk orderings and full orderings must be equal in number and must come from the same size clusters.
assert( (int) _trunks.size() == _N_orderings );
for ( int i = 0; i < _N_orderings; i++ ) {
assert( orders[i].N_contigs() == trunks[i].N_contigs() );
// All of the orderings must match their corresponding clustering by ID.
if ( (int) clusters[i].size() != orders[i].N_contigs() ) PRINT3( i, clusters[i].size(), orders[i].N_contigs() );
assert( (int) clusters[i].size() == orders[i].N_contigs() );
// Trunk orderings must be subsets of full orderings.
assert( trunks[i].N_contigs_used() <= orders[i].N_contigs_used() );
for ( int j = 0; j < _trunks[i].N_contigs_used(); j++ )
assert( orders[i].contig_used( trunks[i].contig_ID(j) ) );
}
}
// Sanity checks for when there's a TrueMapping (i.e., not a reference-free assembly).
if ( _true_mapping ) {
// The species names must match.
assert( _run_params._species == _true_mapping->species() );
// The SAM file is aligned to the draft assembly, so it must show the same number of targets as the TrueMapping has queries (i.e., assembly contigs).
assert( _N_contigs == _true_mapping->NQueries() );
}
}
Reporter::~Reporter()
{
if ( _true_mapping ) delete _true_mapping;
delete _data;
}
void
Reporter::Eval() const
{
// Evaluate the clustering.
EvalClustering();
// Evaluate all the orderings, if there are any.
if ( _N_orderings > 0 ) {
for ( int i = 0; i < _N_orderings; i++ ) {
EvalOrderAccuracy( i, false ); // trunks
EvalOrderAccuracy( i, true ); // full orderings
}
}
// The evaluations past this point are all entirely reference-based.
if ( _true_mapping == NULL ) return;
// Evaluate the contig spacing in the full ordering for now. This can't be done without a reference.
if ( _orders.size() >= 3 && _orders[3].has_gaps() ) EvalGapSizes( 3, true ); // just one chromosome
//DotplotOrderAccuracy( 3, true, false, "POA.3.full.no_interchrom" );
return; // TEMP: skip dotplots for now (save time)
if ( _N_orderings > 0 ) HistogramOrderingErrors( true, "HOE" );
// Create colored dotplots to show the accuracy of the orderings.
for ( int i = 0; i < _N_orderings; i++ ) {
string i_str = boost::lexical_cast<string>(i);
DotplotOrderAccuracy( i, false, false, "POA." + i_str + ".trunk.no_interchrom" );
DotplotOrderAccuracy( i, true, false, "POA." + i_str + ".full.no_interchrom" );
DotplotOrderAccuracy( i, false, true, "POA." + i_str + ".trunk.with_interchrom" );
DotplotOrderAccuracy( i, true, true, "POA." + i_str + ".full.with_interchrom" );
}
}
// Count which contigs are clustered, and evaluate the clustering by comparing it to the TrueMapping.
// Report on contigs that appear to be mis-clustered (i.e., they belong to a different chromosome than the plurality of other contigs in their cluster.)
// This code is similar to GenomeLinkMatrix::ReportMisclusteredContigs(), but doesn't go into as much detail because only topline numbers are expected, not
// troubleshooting help.
void
Reporter::EvalClustering() const
{
cout << Time() << ": EvalClustering" << endl;
// Mark contigs that are in the cluster.
_data->in_cluster.resize( _N_contigs );
for ( int i = 0; i < _N_clusters; i++ )
for ( set<int>::const_iterator it = _clusters[i].begin(); it != _clusters[i].end(); ++it )
_data->in_cluster[*it] = true;
// If there's no reference, nothing more can be done.
if ( _true_mapping == NULL ) return;
_data->cluster_chrom.clear();
int N_aligned_total = 0;
int N_misclustered = 0;
int64_t len_aligned_total = 0;
int64_t len_misclustered = 0;
// For each cluster, calculate the plurality chromosome and figure out the amount of contigs/sequence on that chromosome.
for ( size_t i = 0; i < _clusters.size(); i++ ) {
if ( _clusters[i].empty() ) continue;
// Tallies of which chromosomes contain the contigs in this cluster.
vector<int> aligns( _N_chroms, 0 );
vector<int64_t> align_len( _N_chroms, 0 );
// First, find the chromosome (if any) to which each contig in this cluster aligns.
bool seen_aligned = false;
int N_aligned = 0, N_unaligned = 0;
int64_t len_aligned = 0, len_unaligned = 0;
for ( set<int>::const_iterator it = _clusters[i].begin(); it != _clusters[i].end(); ++it ) {
int chrID = _true_mapping->QTargetID(*it);
// Kludgey code to collapse/merge chromosomes for the purposes of marking "correct" or "incorrect" chromosomes.
//if ( chrID == 6 ) chrID = 2; // fly: chr2L -> chr2R
//if ( chrID == 7 ) chrID = 4; // fly: chr3L -> chr3R
//if ( chrID == 21 ) chrID = 18; // human: chr22 -> chr19
//if ( chrID == 20 ) chrID = 19; // human: chr21 -> chr20
//if ( chrID == 15 || chrID == 21 ) chrID = 18; // postfosmid: chr16, chr22 -> chr19
//if ( chrID == 16 || chrID == 20 ) chrID = 19; // postfosmid: chr17, chr21 -> chr20
if ( chrID == -1 ) {
N_unaligned++;
len_unaligned += _contig_lengths[*it];
}
else {
N_aligned++;
len_aligned += _contig_lengths[*it];
aligns[chrID]++;
align_len[chrID] += _contig_lengths[*it];
seen_aligned = true;
}
}
// Now find which chromosome that contains the plurality of these alignments (measured by length, not number of contigs.)
int N_aligned_correct = 0;
int64_t len_aligned_correct = 0;
int cluster_ID = -1;
for ( size_t j = 0; j < aligns.size(); j++ )
if ( len_aligned_correct < align_len[j] ) {
len_aligned_correct = align_len[j];
N_aligned_correct = aligns[j];
cluster_ID = j;
}
// If none of the contigs in this cluster align, just go ahead and leave them all marked as incorrectly clustered.
assert( ( cluster_ID == -1 ) == ( !seen_aligned ) );
// Add to global tallies.
N_aligned_total += N_aligned;
N_misclustered += N_aligned - N_aligned_correct;
len_aligned_total += len_aligned;
len_misclustered += len_aligned - len_aligned_correct;
_data->cluster_chrom.push_back( cluster_ID );
_data->cluster_N_good .push_back( N_aligned_correct );
_data->cluster_len_good.push_back( len_aligned_correct );
_data->cluster_N_bad .push_back( N_aligned - N_aligned_correct );
_data->cluster_len_bad .push_back( len_aligned - len_aligned_correct );
_data->cluster_N_unaligned .push_back( N_unaligned );
_data->cluster_len_unaligned.push_back( len_unaligned );
}
}
// Count which contigs are ordered and oriented in an ordering, and evaluate the accuracy by comparing it to the TrueMapping.
// If full_order, then eval _orders[cluster_ID]; otherwise eval _trunks[cluster_ID].
void
Reporter::EvalOrderAccuracy( int cluster_ID, bool full_order ) const
{
bool verbose = false;
// Initialize the variables in the ReporterData struct.
OrderingFlags & flags = full_order ? _data->order_flags : _data->trunk_flags;
flags.used .resize( _N_contigs );
flags.high_quality.resize( _N_contigs );
flags.unaligned .resize( _N_contigs );
flags.chr_mismatch.resize( _N_contigs );
flags.order_error .resize( _N_contigs );
flags.orient_error.resize( _N_contigs );
// Convert this cluster to a local vector<> for easier random access.
const vector<int> cluster = SetToVec( _clusters[cluster_ID] );
const ContigOrdering & scaffold = full_order ? _orders[cluster_ID] : _trunks[cluster_ID];
//scaffold.Print();
if ( scaffold.N_contigs() != (int) cluster.size() ) PRINT4( cluster_ID, full_order, scaffold.N_contigs(), cluster.size() );
assert( scaffold.N_contigs() == (int) cluster.size() );
int N_contigs_used = scaffold.N_contigs_used();
// Step through all the contigs in the scaffold.
for ( int i = 0; i < N_contigs_used; i++ ) {
// Find the contig position and orientation according to the scaffold.
int contig0 = cluster[ scaffold.contig_ID(i) ];
flags.used[contig0] = true;
// Apply the quality filter (the threshold of which depends on what assembly this is) and determine whether or not this contig passes.
// If no quality scores are given, the contig automatically passes.
flags.high_quality[contig0] = !scaffold.has_Q_scores() || scaffold.contig_orient_Q(i) >= _run_params._report_quality_filter;
// If we have no reference, we can't do any error marking.
if ( _true_mapping == NULL ) continue;
// To help call errors, find the previous two contig positions and orientations.
int contig1 = ( i == 0 ? -1 : cluster[ scaffold.contig_ID(i-1) ] );
int contig2 = ( i <= 1 ? -1 : cluster[ scaffold.contig_ID(i-2) ] );
// Find the contig position and orientation of each of these contigs, according to the TrueMapping.
int chrom0 = _true_mapping->QTargetID (contig0);
int start0 = _true_mapping->QTargetStart(contig0);
int stop0 = _true_mapping->QTargetStop (contig0);
//if ( _true_mapping->QRC(contig0) ) { int swap = start0; start0 = stop0; stop0 = swap; }
int chrom1 = -1, start1 = -1, stop1 = -1;
int chrom2 = -1, start2 = -1, stop2 = -1;
if ( i >= 1 ) {
chrom1 = _true_mapping->QTargetID (contig1);
start1 = _true_mapping->QTargetStart(contig1);
stop1 = _true_mapping->QTargetStop (contig1);
//if ( _true_mapping->QRC(contig1) ) { int swap = start1; start1 = stop1; stop1 = swap; }
}
if ( i >= 2 ) {
chrom2 = _true_mapping->QTargetID (contig2);
start2 = _true_mapping->QTargetStart(contig2);
stop2 = _true_mapping->QTargetStop (contig2);
//if ( _true_mapping->QRC(contig2) ) { int swap = start2; start2 = stop2; stop2 = swap; }
}
// Mark contigs that aren't aligned to (canonical) chromosomes.
if ( chrom0 == -1 ) flags.unaligned[contig0] = true;
// Test for inter-chromosome errors.
// If there is an error - i.e., this contig and its immediate predecessor both align to different canonical chromosomes - mark both contigs as mismatched.
if ( chrom0 != -1 && chrom1 != -1 && chrom0 != chrom1 ) {
flags.chr_mismatch[contig0] = true;
flags.chr_mismatch[contig1] = true;
if ( verbose ) cout << "ERROR\tCLUSTERING\tCLUSTER:\t" << cluster_ID << endl;
}
// Find the relative positions of this contig w.r.t. its predecessor(s).
// pos = 1 means the subsequent contig is after the predecessor on the chromosome;
// pos = -1 means the subsequent contig is before the predecessor on the chromosome;
// pos = 0 means they overlap.
// Note that if i <= 1, one or both of these will evaluate to 1, and that's ok.
int pos01 = ( start0 > start1 && stop0 > stop1 ) ? 1 : ( start0 < start1 && stop0 < stop1 ) ? -1 : 0;
int pos12 = ( start1 > start2 && stop1 > stop2 ) ? 1 : ( start1 < start2 && stop1 < stop2 ) ? -1 : 0;
// If this contig and its predecessor both align to the same canonical chromosome...
if ( i >= 1 && chrom0 != -1 && chrom0 == chrom1 ) {
// Look for contig orientation errors (i.e., this contig's orientation is not what we would expect based on its position w.r.t. its predecessor.)
bool expected_rc = _true_mapping->QRC(contig0) ^ scaffold.contig_rc(i);
if ( ( expected_rc && pos01 == 1 ) || ( !expected_rc && pos01 == -1 ) ) {
flags.orient_error[contig0] = true;
if ( verbose ) cout << "ERROR\tORIENTATION\tCLUSTER:\t" << cluster_ID << "\tCONTIG IN CLUSTER:\t" << i << "\tLENGTH:\t" << _contig_lengths[contig0] << "\tCONTIG GLOBAL ID:\t" << contig0 << "\tTRUE POSITION: " << _true_mapping->QTargetName(contig0) << ":" << _true_mapping->QTargetStart(contig0) << "-" << _true_mapping->QTargetStop(contig0) << endl;
}
//else
//cout << "ORIENTING OK!\tQ = " << scaffold.contig_orient_Q(i) << endl;
}
// If this contig and its two predecessors both align to the same canonical chromosome...
if ( i >= 2 && chrom0 != -1 && chrom0 == chrom1 && chrom0 == chrom2 ) {
// Look for contig ordering errors: three contigs are adjacent in an ordering, but their relative position in the ordering doesn't match the relative
// positions of their true alignments.
if ( pos01 != pos12 && pos01 != 0 && pos12 != 0 ) {
if ( verbose ) cout << "ERROR\tORDERING\tCLUSTER:\t" << cluster_ID << "\tCONTIG IN CLUSTER:\t" << i-1 << "\tLENGTH:\t" << _contig_lengths[contig1] << "\tCONTIG GLOBAL ID:\t" << contig1 << "\tTRUE POSITION: " << _true_mapping->QTargetName(contig1) << ":" << _true_mapping->QTargetStart(contig1) << "-" << _true_mapping->QTargetStop(contig1) << endl;
flags.order_error[contig1] = true;
}
}
}
if ( verbose && _true_mapping != NULL )
for ( int i = 0; i < N_contigs_used; i++ ) {
int contig = cluster[ scaffold.contig_ID(i) ];
if ( !_true_mapping->QMaps(contig) ) continue;
if ( flags.order_error[contig] ) cout << "YESERROR:\t\t" << contig << "\tTRUE POSITION: " << _true_mapping->QTargetName(contig) << ":" << _true_mapping->QTargetStart(contig) << "-" << _true_mapping->QTargetStop(contig) << endl;
if ( !flags.order_error[contig] ) cout << "NOERROR:\t\t" << contig << "\tTRUE POSITION: " << _true_mapping->QTargetName(contig) << ":" << _true_mapping->QTargetStart(contig) << "-" << _true_mapping->QTargetStop(contig) << endl;
}
}
// Evaluate the contig spacing.
// This function should be called after calling EvalOrderAccuracy on the same ordering, so that the _data flags are filled.
// TODO: still working on getting contig spacing to work (ChromLinkMatrix::SpaceContigs) thus still working on this file
void
Reporter::EvalGapSizes( int cluster_ID, bool full_order ) const
{
RequireReference(); // can't eval gap sizes without a reference
// Convert this cluster to a local vector<> for easier random access.
const vector<int> cluster = SetToVec( _clusters[cluster_ID] );
// Get the scaffold.
const ContigOrdering & scaffold = full_order ? _orders[cluster_ID] : _trunks[cluster_ID];
int N_contigs_used = scaffold.N_contigs_used();
// Get the flags indicating errors. If EvalOrderAccuracy() hasn't been called on this ordering, these asserts will fail.
const OrderingFlags & flags = full_order ? _data->order_flags : _data->trunk_flags;
assert( !flags.used.empty() );
assert( !flags.unaligned .empty() );
assert( !flags.chr_mismatch.empty() );
assert( !flags.order_error .empty() );
assert( !flags.orient_error.empty() );
// Find the GC content and mappability for each contig.
// We'll correlate these measures with the gap size estimation error of each contig, and attempt to determine the cause of errors.
vector<double> contig_GC = ParseTabDelimFile<double>( "../human/prefosmid/assembly.GCpct", 1 );
vector<double> contig_mappability = ParseTabDelimFile<double>( "../human/prefosmid/assembly.MQ>0pct", 1 );
assert( (int) contig_GC .size() == _N_contigs );
assert( (int) contig_mappability.size() == _N_contigs );
//vector<double> contig_RF = ParseTabDelimFile<double>( "RFs.txt", 0 ); // TEMP
vector<double> contig_LDE = ParseTabDelimFile<double>( "enrichments.txt", 0 ); // LDE = link density enrichment (on contig, compared to expectations)
vector<double> dists, dists_est, errors, GCs1, GCs2, GCs_comb, maps1, maps2, maps_comb, LDEs1, LDEs2, LDEs_comb, RFs_comb;
// Step through all the pairs of adjacent contigs in the scaffold.
for ( int i = 0; i+1 < N_contigs_used; i++ ) {
int contig1 = cluster[ scaffold.contig_ID(i) ];
int contig2 = cluster[ scaffold.contig_ID(i+1) ];
int dist_est = scaffold.gap_size(i);
if ( dist_est == INT_MAX ) continue; // this can happen if the gaps were estimated by SpaceContigs() using a subset of the SAM files
// Ignore pairs of adjacent contigs unless both of them map to the reference, on the same chromosome, with no apparent ordering or orienting errors.
if ( flags.unaligned [contig1] || flags.unaligned [contig2] ) continue;
if ( flags.chr_mismatch[contig1] || flags.chr_mismatch[contig2] ) continue;
if ( flags.order_error [contig1] || flags.order_error [contig2] ) continue;
if ( flags.orient_error[contig1] || flags.orient_error[contig2] ) continue;
// TEMP: Only look at big contigs.
// threshold types: OR < additive < multiplicative < AND
// OR = 0.08; additive = 0.17; multiplicative = 0.20; AND = 0.28
//if ( _contig_lengths[contig1] < 500000 && _contig_lengths[contig2] < 500000 ) continue;
//if ( _contig_lengths[contig1] + _contig_lengths[contig2] < 700000 ) continue;
//if ( _contig_lengths[contig1] / 1000 * _contig_lengths[contig2] / 1000 < 80000 ) continue;
//if ( _contig_lengths[contig1] < 200000 || _contig_lengths[contig2] < 200000 ) continue;
// Find the true distance between these contigs, according to their reference alignments.
assert( _true_mapping->QTargetID( contig1 ) == _true_mapping->QTargetID( contig2 ) );
int start1 = _true_mapping->QTargetStart( contig1 );
int stop1 = _true_mapping->QTargetStop ( contig1 );
int start2 = _true_mapping->QTargetStart( contig2 );
int stop2 = _true_mapping->QTargetStop ( contig2 );
int dist = max( start2 - stop1, start1 - stop2 );
if ( dist < 100 ) dist = 100; // if the contigs overlap, define the distance as 100
//PRINT7( contig1, contig2, start1, stop1, start2, stop2, dist );
cout << "EvalGapSizes: Contigs #" << i << ", " << i+1 << " in the scaffold\t(global IDs: " << contig1 << ", " << contig2 << ", lengths = " << _contig_lengths[contig1] << ", " << _contig_lengths[contig2] << ")\tare at a true distance of\t" << dist << "\tvs. predicted distance of\t" << dist_est << endl;
// TODO: compare these true distances to the distances found in ChromLinkMatrix::SpaceContigs, and investigate differences
// grep COMPARISON b | cut -f2,3 > c ; QuickDotplot c
cout << "COMP1\t" << log(dist+1) << "\t" << log(dist_est+1) << endl;
double error = double( dist_est+1 ) / (dist+1); // "error" = the multiplicative factor of wrongness by which the estimate exceeds the true distance
double GC1 = contig_GC [contig1], GC2 = contig_GC[contig2];
double map1 = contig_mappability[contig1], map2 = contig_mappability[contig2];
double LDE1 = contig_LDE [contig1], LDE2 = contig_LDE[contig2];
//double RF1 = contig_RF [contig1], RF2 = contig_RF[contig2];
double GC_combined = ( GC1 * _contig_lengths[contig1] + GC2 * _contig_lengths[contig2] ) / ( _contig_lengths[contig1] + _contig_lengths[contig2] );
double map_combined = ( map1 * _contig_lengths[contig1] + map2 * _contig_lengths[contig2] ) / ( _contig_lengths[contig1] + _contig_lengths[contig2] );
double LDE_combined = ( LDE1 * _contig_lengths[contig1] + LDE2 * _contig_lengths[contig2] ) / ( _contig_lengths[contig1] + _contig_lengths[contig2] );
//double RF_combined = ( RF1 * _contig_lengths[contig1] + RF2 * _contig_lengths[contig2] ) / ( _contig_lengths[contig1] + _contig_lengths[contig2] );
LDE_combined = log( LDE_combined );
cout << "COMP2\t" << GC_combined << '\t' << log(error) << endl;
cout << "COMP3\t" << map_combined << '\t' << log(error) << endl;
cout << "COMP4\t" << LDE_combined << '\t' << log(error) << endl;
//cout << "COMP4\t" << RF_combined << '\t' << log(error+1) << endl;
dists_est.push_back( log(dist_est+1) );
dists .push_back( log(dist+1) );
errors .push_back( log(error) );
GCs1 .push_back( GC1 );
GCs2 .push_back( GC2 );
GCs_comb .push_back( GC_combined );
maps1 .push_back( map1 );
maps2 .push_back( map2 );
maps_comb.push_back( map_combined );
LDEs1 .push_back( LDE1 );
LDEs2 .push_back( LDE2 );
LDEs_comb.push_back( LDE_combined );
//RFs_comb .push_back( RF_combined );
}
int N_data = dists.size();
PRINT( N_data );
PRINT( PearsonCorrelation( dists, dists_est ) );
PRINT( PearsonCorrelation( errors, GCs_comb ) );
PRINT( PearsonCorrelation( errors, maps_comb ) );
PRINT( PearsonCorrelation( errors, LDEs_comb ) );
//PRINT( PearsonCorrelation( errors, RFs_comb ) );
}
// ReportChartWithReference: Print out all the ReporterData info in a pretty chart. This is the version for an assembly with a reference available.
void
Reporter::ReportChartWithReference() const
{
RequireReference();
assert( _N_clusters == (int) _data->cluster_chrom.size() );
assert( _N_clusters == (int) _data->cluster_N_good.size() );
assert( _N_clusters == (int) _data->cluster_N_bad.size() );
assert( _N_clusters == (int) _data->cluster_N_unaligned.size() );
assert( _N_clusters == (int) _data->cluster_len_good.size() );
assert( _N_clusters == (int) _data->cluster_len_bad.size() );
assert( _N_clusters == (int) _data->cluster_len_unaligned.size() );
const double epsilon = 1.0e-12; // small number to add to denominators to avoid division by 0
// Open the ReportChart output file.
string report_chart_file = _run_params._out_dir + "/REPORT.txt";
cout << Time() << ": Writing a ReportChart to " << report_chart_file << endl;
ofstream out( report_chart_file.c_str() );
// Write the run parameters to the file.
_run_params.PrintParams( out );
out << "\n\n\n\n\n";
// Print basic info about this assembly.
out << Time() << ": ReportChart!" << endl << endl;
out << "Info about input assembly:" << endl;
if ( _run_params._sim_bin_size > 0 ) out << "REFERENCE GENOME CHOPPED UP INTO " << _run_params._sim_bin_size << "-BP BINS" << endl;
else out << "DE NOVO ASSEMBLY, reference genome available for validation" << endl;
out << "Species: " << _run_params._species << endl;
out << "N chromosomes in reference, including non-canonical:\t" << _N_chroms << endl;
out << "N contigs:\t" << _N_contigs << "\t\tTotal length:\t" << _total_contig_length << "\t\tN50:\t" << N50( _contig_lengths ) << endl;
out << "N clusters (derived):\t\t" << _N_clusters << endl;
out << "N non-singleton clusters:\t" << N_non_singleton_clusters() << endl;
out << "N orderings found:\t\t" << _N_orderings << endl;
out << endl << endl;
out << "############################\n";
out << "# #\n";
out << "# CLUSTERING METRICS #\n";
out << "# #\n";
out << "############################\n\n\n";
out << setprecision(4);
// Derive and print basic numbers about clustering.
int N_in_clusters = _data->in_cluster.count();
int64_t len_in_clusters = contig_length_if( _data->in_cluster );
double pct_N_in_clusters = 100.0 * N_in_clusters / _N_contigs;
double pct_len_in_clusters = 100.0 * len_in_clusters / _total_contig_length;
out << "Number of contigs in clusters:\t" << N_in_clusters << "\t\t(" << pct_N_in_clusters << "% of all contigs)" << endl;
out << "Length of contigs in clusters:\t" << len_in_clusters << "\t(" << pct_len_in_clusters << "% of all sequence length)" << endl;
out << endl;
int N_singletons = 0;
int64_t len_singletons = 0;
// Print a chart with info about each cluster, including what portion of the cluster (length, contigs) aligns to the plurality chromosome or fails to align.
string horiz_line = "+---------+----------------+---------+---------------+------------------+-------------+-------------------+----------------------+\n";
out << horiz_line;
out << "| CLUSTER | PLURALITY | Number of contigs in cluster... | Length of contigs in cluster... |\n";
out << "| NUMBER | CHROMOSOME | TOTAL | UNALIGNED | WRONG CHROMOSOME | TOTAL | UNALIGNED | WRONG CHROMOSOME |\n";
out << horiz_line;
for ( int i = 0; i < _N_clusters; i++ ) {
// Skip singleton clusters.
if ( _clusters[i].size() == 1 ) {
N_singletons++;
len_singletons += _contig_lengths[ *(_clusters[i].begin()) ];
continue;
}
int chrom_ID = _data->cluster_chrom[i];
// Re-format the chromosome name to appear nicely in the output chart: <= 14 characters and centered in whitespace.
string chrom_name = ( chrom_ID == -1 ? "none" : _true_mapping->TargetName(chrom_ID) );
chrom_name = chrom_name.substr(0,14);
for ( size_t j = 0; j + chrom_name.size() < 14; j++ ) chrom_name += ' '; // note that chrom_name.size() increases with each iteration
// Calculate the total cluster length.
boost::dynamic_bitset<> in_cluster( _N_contigs );
int cluster_N = 0;
for ( set<int>::const_iterator it = _clusters[i].begin(); it != _clusters[i].end(); ++it ) {
cluster_N++;
in_cluster[*it] = true;
}
int64_t cluster_len = contig_length_if( in_cluster );
// For each chromosome, calculate percentages, and print a line to the chart.
double pct_N_bad = 100.0 * _data->cluster_N_bad [i] / ( _data->cluster_N_good [i] + _data->cluster_N_bad [i] + epsilon );
double pct_len_bad = 100.0 * _data->cluster_len_bad[i] / ( _data->cluster_len_good[i] + _data->cluster_len_bad[i] + epsilon );
double pct_N_unaligned = 100.0 * _data->cluster_N_unaligned[i] / cluster_N;
double pct_len_unaligned = 100.0 * _data->cluster_len_unaligned[i] / cluster_len;
sprintf( _LINE, "|%6d | %14s |%7ld |%5d (%6.2f%%)| %5d (%6.2f%%) | %11ld |%9ld (%6.3f%%)| %9ld (%6.3f%%) |\n", i, chrom_name.c_str(), _clusters[i].size(), _data->cluster_N_unaligned[i], pct_N_unaligned, _data->cluster_N_bad[i], pct_N_bad, cluster_len, _data->cluster_len_unaligned[i], pct_len_unaligned, _data->cluster_len_bad[i], pct_len_bad );
out << _LINE;
}
out << horiz_line;
// Print a final line with totals.
int total_N_good = accumulate( _data->cluster_N_good .begin(), _data->cluster_N_good .end(), 0 );
int64_t total_len_good = accumulate( _data->cluster_len_good.begin(), _data->cluster_len_good.end(), int64_t(0) );
int total_N_bad = accumulate( _data->cluster_N_bad .begin(), _data->cluster_N_bad .end(), 0 );
int64_t total_len_bad = accumulate( _data->cluster_len_bad .begin(), _data->cluster_len_bad .end(), int64_t(0) );
int total_N_unaligned = accumulate( _data->cluster_N_unaligned .begin(), _data->cluster_N_unaligned.end(), 0 );
int64_t total_len_unaligned = accumulate( _data->cluster_len_unaligned.begin(), _data->cluster_len_unaligned.end(), int64_t(0) );
double pct_N_bad = 100.0 * total_N_bad / ( total_N_good + total_N_bad + epsilon );
double pct_len_bad = 100.0 * total_len_bad / ( total_len_good + total_len_bad + epsilon );
double pct_N_unaligned = 100.0 * total_N_unaligned / N_in_clusters;
double pct_len_unaligned = 100.0 * total_len_unaligned / len_in_clusters;
sprintf( _LINE, "| TOTAL |%7d |%5d (%6.2f%%)| %5d (%6.2f%%) | %11ld |%9ld (%6.3f%%)| %9ld (%6.3f%%) |\n",
N_in_clusters, total_N_unaligned, pct_N_unaligned, total_N_bad, pct_N_bad, len_in_clusters, total_len_unaligned, pct_len_unaligned, total_len_bad, pct_len_bad );
out << _LINE;
out << horiz_line;
if ( N_singletons > 0 )
out << endl << "NOTE: The above chart omits " << N_singletons << " `singleton' clusters, with only one contig each (total length = " << len_singletons << ")" << endl;
out << endl << endl;
if ( _N_orderings == 0 ) return;
out << "############################\n";
out << "# #\n";
out << "# ORDERING METRICS #\n";
out << "# #\n";
out << "############################\n\n\n";
// Derive and print basic numbers about ordering.
ReportChartOrderingPercentages(out);
// This horizontal line matches the chart lines in ReportChartOrderingErrors().
horiz_line = "+-------------+---------------------------+--------------+--------------+\n";
// Print the chart for ordering errors.
out << horiz_line;
out << "| TRUNK/FULL? | ERROR TYPE | % CONTIGS | % LENGTH |\n";
out << horiz_line;
ReportChartOrderingErrors( false, out ); // trunk only
out << horiz_line;
ReportChartOrderingErrors( true, out ); // full ordering
out << horiz_line;
out << endl << endl;
// Print a footer with useful definitions.
out << "###################################\n\n";
out << "Definitions:" << endl;
out << "'Plurality chromosome':\t\tThe plurality chromosome for a cluster is the chromosome that contains the most contigs (by length!) in that cluster.\n\t\t\t\tContigs are considered mis-clustered if they align to a canonical chromosome other than the plurality chromosome.\n";
out << "'Inter-chromosome error':\tTwo contigs are adjacent in an ordering but align to different canonical chromosomes.\n";
out << "'Ordering error':\t\tThree contigs are adjacent in an ordering, but their relative position in the ordering doesn't match the relative positions of their true alignments.\n";
out << "'Orientation error':\t\tA contig's assigned orientation in its ordering doesn't match the orientation it should have, imputed from its and its neighbors' true alignment position.\n";
out << endl << endl;
out.close();
}
// ReportChartNoReference: Print out all the ReporterData info in a pretty chart. This is the version for a reference-free assembly.
void
Reporter::ReportChartNoReference() const
{
assert( !_run_params._use_ref );
// Open the ReportChart output file.
string report_chart_file = _run_params._out_dir + "/REPORT.txt";
cout << Time() << ": Writing a ReportChart to " << report_chart_file << endl;
ofstream out( report_chart_file.c_str() );
// Write the run parameters to the file.
_run_params.PrintParams( out );
out << "\n\n\n\n\n";
// Print basic info about this assembly.
out << Time() << ": ReportChart!" << endl << endl;
out << "Info about input assembly:" << endl;
out << "DE NOVO ASSEMBLY, with no reference genome (less validation available)" << endl;
out << "Species: " << _run_params._species << endl;
out << "N contigs:\t" << _N_contigs << "\t\tTotal length:\t" << _total_contig_length << "\t\tN50:\t" << N50( _contig_lengths ) << endl;
out << "N clusters (derived):\t" << _N_clusters << endl;
out << "N non-singleton clusters:\t" << N_non_singleton_clusters() << endl;
out << "N orderings found:\t" << _N_orderings << endl;
out << endl << endl;
out << "############################\n";
out << "# #\n";
out << "# CLUSTERING METRICS #\n";
out << "# #\n";
out << "############################\n\n\n";
out << setprecision(4);
// Derive and print basic numbers about clustering.
int N_in_clusters = _data->in_cluster.count();
int64_t len_in_clusters = contig_length_if( _data->in_cluster );
double pct_N_in_clusters = 100.0 * N_in_clusters / _N_contigs;
double pct_len_in_clusters = 100.0 * len_in_clusters / _total_contig_length;
out << "Number of contigs in clusters:\t" << N_in_clusters << "\t\t(" << pct_N_in_clusters << "% of all contigs)" << endl;
out << "Length of contigs in clusters:\t" << len_in_clusters << "\t(" << pct_len_in_clusters << "% of all sequence length)" << endl;
out << endl;
// Print a chart with info about each cluster. It's not a ton of info because there's no reference.
string horiz_line = "+----------+-----------+-------------+\n";
out << horiz_line;
out << "| CLUSTER | NUMBER OF | LENGTH OF |\n";
out << "| NUMBER | CONTIGS | CONTIGS | \n";
out << horiz_line;
for ( int i = 0; i < _N_clusters; i++ ) {
// Calculate the total cluster length.
boost::dynamic_bitset<> in_cluster( _N_contigs );
for ( set<int>::const_iterator it = _clusters[i].begin(); it != _clusters[i].end(); ++it )
in_cluster[*it] = true;
int64_t cluster_len = contig_length_if( in_cluster );
sprintf( _LINE, "| %6d | %7ld | %11ld |\n", i, _clusters[i].size(), cluster_len );
out << _LINE;
}
out << horiz_line;
// Print a final line with totals.
sprintf( _LINE, "| TOTAL | %7d | %11ld |\n", N_in_clusters, len_in_clusters );
out << _LINE;
out << horiz_line;
out << endl << endl;
if ( _N_orderings == 0 ) return;
out << "############################\n";
out << "# #\n";
out << "# ORDERING METRICS #\n";
out << "# #\n";
out << "############################\n\n\n";
// Derive and print basic numbers about ordering.
ReportChartOrderingPercentages(out);
out.close();
}
// RequireReference: Throw a verbose error if _true_mapping == NULL. This should always be called before anything that uses _true_mapping.
void
Reporter::RequireReference() const
{
if ( _true_mapping != NULL ) return;
cerr << "ERROR: Reporter module tried to call an evaluation function requiring a TrueMapping object, but there's no TrueMapping because this is a reference-free assembly (USE_REFERENCE=0)." << endl;
exit(1);
}
// HistogramOrderingErrors: Make a histogram showing the odds of a contig being mis-ordered, as a function of the contig's length.
void
Reporter::HistogramOrderingErrors( const bool full_order, const string & file_head ) const
{
RequireReference();
// For contig 'lengths', either use actual lengths or the number of RE sites.
bool use_RE_lens = true;
vector<int> lens = use_RE_lens ? ParseTabDelimFile<int>( _run_params.DraftContigRESitesFilename(), 1 ) : _contig_lengths;
// If this assert fails, the input contig_lengths file is inconsistent with the original dataset (wrong number of contigs).
assert( (int) lens.size() == _N_contigs );
cout << Time() << ": HistogramOrderingErrors with file_head = " << file_head << endl;
const OrderingFlags & flags = full_order ? _data->order_flags : _data->trunk_flags;
// Make logarithmic-sized bins for contig lengths. Bin #N will contain data for contigs with sizes in the range 2^N <= size < 2^(N+1).
int longest_contig = *( max_element( lens.begin(), lens.end() ) );
int N_bins = 1; while ( longest_contig >>= 1 ) N_bins++;
//PRINT2( longest_contig, N_bins );
vector<int64_t> N_total( N_bins, 0 ), len_total( N_bins, 0 ), N_order_error( N_bins, 0 ), N_orient_error( N_bins, 0 ), N_o_and_o_error( N_bins, 0 );
// Now loop over all contigs and gather data.
for ( int i = 0; i < _N_contigs; i++ ) {
// Skip contigs that aren't used or that don't align; these don't have the option of being ordering errors.
if ( !flags.used[i] || flags.unaligned[i] ) continue;
// For each contig, look up its length, and determine the bin it belongs in.
int len = lens[i];
int len_bin = 0; while ( len >>= 1 ) len_bin++;
assert( len_bin < N_bins );
// Look up the contig's error(s) and fill the data structures.
N_total[len_bin]++;
len_total[len_bin] += lens[i];
if ( flags.order_error[i] ) N_order_error [len_bin]++;
if ( flags.orient_error[i] ) N_orient_error[len_bin]++;
if ( flags.order_error[i] && flags.orient_error[i] ) N_o_and_o_error[len_bin]++;
}
// Make the histogram.
ofstream out( file_head.c_str() );
out << "#log2(contig_N_RE_sites)\tN_contigs\ttotal_contig_N_RE_sites\tpct_ok\tpct_order_error\tpct_orient_error\tpct_order_and_orient_errors" << endl;
for ( int i = 0; i < N_bins; i++ ) {
double pct_order_error = 100.0 * N_order_error[i] / N_total[i];
double pct_orient_error = 100.0 * N_orient_error[i] / N_total[i];
double pct_o_and_o_error = 100.0 * N_o_and_o_error[i] / N_total[i];
//PRINT5( i, N_total[i], pct_order_error, pct_orient_error, pct_o_and_o_error );
out << i
<< '\t' << N_total[i]
<< '\t' << len_total[i]
<< '\t' << ( 100 - pct_order_error - pct_orient_error + pct_o_and_o_error )
<< '\t' << ( pct_order_error - pct_o_and_o_error )
<< '\t' << ( pct_orient_error - pct_o_and_o_error )
<< '\t' << pct_o_and_o_error
<< endl;
}
out.close();
}
// DotplotOrderAccuracy: Make a QuickDotplot of the ordering, highlighting contigs that are mis-ordered. Uses the output of EvalOrderAccuracy.
void
Reporter::DotplotOrderAccuracy( const int ordering_ID, const bool full_order, const bool plot_interchrom_contigs, const string & file_head ) const
{
RequireReference();
assert( ordering_ID >= 0 && ordering_ID < _N_orderings );
cout << Time() << ": DotplotOrderAccuracy with ordering_ID = " << ordering_ID << ", full_order = " << int(full_order) << "; file_head = " << file_head << endl;
// Pick the ordering to analyze.
const ContigOrdering & scaffold = full_order ? _orders[ordering_ID] : _trunks[ordering_ID];
const OrderingFlags & flags = full_order ? _data->order_flags : _data->trunk_flags;
//scaffold.Print();
// Get the full ordering of the contigs on reference.
vector<int> contig_order = _true_mapping->QueriesToGenomeOrder();