-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathm8_to_mapLen_hist.cpp
executable file
·109 lines (91 loc) · 2.68 KB
/
m8_to_mapLen_hist.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include <map>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <zlib.h>
#include "kmseq.h"
using namespace std;
typedef long long LL;
int getCoverage(vector<pair<LL, LL> > &v, int n) {
if (n == 0) return 0;
sort(v.begin(), v.begin() + n);
auto intv = v[0];
LL cov = 0;
for (int i = 1; i < n; ++i) {
if (v[i].first <= intv.second) {
intv.second = max(v[i].second, intv.second);
} else {
cov += intv.second - intv.first + 1;
intv = v[i];
}
}
cov += intv.second - intv.first + 1;
return cov;
}
int main(int argc, char **argv) {
if (argc > 1) {
cerr << "Usage: " << argv[0] << "[ <ref.fa> <contig.fa> ] < in.m8" << endl;
return 2;
}
map<string, int> QLen, TLen;
bool calcAvg = false;
if (argc >= 3) {
calcAvg = true;
gzFile fp = gzopen(argv[1], "r");
kmseq<gzFile> seqT(fp);
while (seqT.read() >= 0) {
TLen[seqT.name()] = seqT.seq().length();
}
gzclose(fp);
fp = gzopen(argv[2], "r");
kmseq<gzFile> seqQ(fp);
while (seqQ.read() >= 0) {
QLen[seqQ.name()] = seqQ.seq().length();
}
gzclose(fp);
}
map<string, vector<pair<LL, LL> > > intervals;
map<string, vector<pair<LL, int> > > queryAlgnmtLen; // first: len, second: index
map<string, vector<int> > queryID;
string qid, sid, lastq;
double identity;
LL algn_len, mm, gaps, qs, qe, ss, se;
double e_value, bit_score;
while (cin >> qid >> sid
>> identity >> algn_len
>> mm >> gaps >> qs >> qe
>> ss >> se >> e_value >> bit_score) {
if (lastq != qid) {
if (ss > se) { swap(ss, se); }
intervals[sid].push_back(make_pair(ss, se));
queryAlgnmtLen[sid].push_back(make_pair(abs(qe - qs + 1), (int)queryAlgnmtLen[sid].size()));
queryID[sid].push_back(qid);
lastq = qid;
}
}
for (auto it = queryAlgnmtLen.begin(); it != queryAlgnmtLen.end(); ++it) {
sort(it->second.rbegin(), it->second.rend());
cout << "Target: " << it->first;
if (calcAvg) { cout << TLen[it->first]; }
cout << endl;
vector<pair<LL, LL> > v;
double totalMappingLen = 0, totalLength = 0;
int NC50 = 0;
for (int i = 0; i < it->second.size(); ++i) {
v.push_back(intervals[it->first][it->second[i].second]);
totalLength += QLen[queryID[sid][it->second[i].second]];
totalMappingLen += it->second[i].first;
cout << it->second[i].first << '\t'
<< QLen[queryID[sid][it->second[i].second]] << '\t'
<< double(it->second[i].first) / QLen[queryID[sid][it->second[i].second]] << '\t'
<< getCoverage(v, i+1) << endl;
if (NC50 == 0 && totalMappingLen >= .5 * TLen[sid]) {
NC50 = it->second[i].first;
}
}
cout << "Avg mapping len: " << totalMappingLen / totalLength
<< "\tNC50: " << NC50 << endl;
}
return 0;
}