-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_merger.pl
executable file
·164 lines (151 loc) · 4.29 KB
/
read_merger.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/perl
# Copyright 2013-2015, Derrick Wood <[email protected]>
#
# This file is part of the Kraken taxonomic sequence classification system.
#
# Kraken is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Kraken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kraken. If not, see <http://www.gnu.org/licenses/>.
# Merges two files specified on the command line, print to stdout
# Designed to be called by kraken
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
my $PROG = basename $0;
my $fasta_input = 0;
my $fastq_input = 0;
my $gunzip = 0;
my $bunzip2 = 0;
my $check_names = 0;
GetOptions(
"fa" => \$fasta_input,
"fq" => \$fastq_input,
"gz" => \$gunzip,
"bz2" => \$bunzip2,
"check-names" => \$check_names
);
if (@ARGV != 2) {
die "$PROG: must have exactly two filename arguments\n";
}
for my $file (@ARGV) {
if (! -e $file) {
die "$PROG: $file does not exist\n";
}
if (! -f $file) {
die "$PROG: $file is not a regular file\n";
}
}
my $compressed = $gunzip || $bunzip2;
if ($gunzip && $bunzip2) {
die "$PROG: can't use both gzip and bzip2 compression flags\n";
}
if (! ($fasta_input xor $fastq_input)) {
die "$PROG: must specify exactly one of FASTA and FASTQ input flags\n";
}
# Open files (or pipes from decompression progs)
my ($fh1, $fh2);
if ($gunzip) {
open $fh1, "-|", "gunzip", "-c", $ARGV[0]
or die "$PROG: can't open gunzip pipe with $ARGV[0]: $!\n";
open $fh2, "-|", "gunzip", "-c", $ARGV[1]
or die "$PROG: can't open gunzip pipe with $ARGV[1]: $!\n";
}
elsif ($bunzip2) {
open $fh1, "-|", "bunzip2", "-c", $ARGV[0]
or die "$PROG: can't open bunzip2 pipe with $ARGV[0]: $!\n";
open $fh2, "-|", "bunzip2", "-c", $ARGV[1]
or die "$PROG: can't open bunzip2 pipe with $ARGV[1]: $!\n";
}
else {
open $fh1, "<", $ARGV[0]
or die "$PROG: can't open $ARGV[0]: $!\n";
open $fh2, "<", $ARGV[1]
or die "$PROG: can't open $ARGV[1]: $!\n";
}
# read/merge/print loop
# make sure names match before merging
my ($seq1, $seq2);
while (defined($seq1 = read_sequence($fh1))) {
$seq2 = read_sequence($fh2);
if (! defined $seq2) {
die "$PROG: mismatched sequence counts\n";
}
if ($check_names && $seq1->{id} ne $seq2->{id}) {
die "$PROG: mismatched mate pair names ('$seq1->{id}' & '$seq2->{id}')\n";
}
print_merged_sequence($seq1, $seq2);
}
if (defined($seq2 = read_sequence($fh2))) {
die "$PROG: mismatched sequence counts\n";
}
close $fh1;
close $fh2;
{
my %buffers; # Needed due to fasta formatting
sub read_sequence {
my $fh = shift;
my $id;
my $seq = "";
if (! exists $buffers{$fh}) {
$buffers{$fh} = <$fh>;
}
if (! defined $buffers{$fh}) { # No more file
return undef;
}
if ($fasta_input) {
if ($buffers{$fh} =~ /^>(\S+)/) {
$id = $1;
}
else {
die "$PROG: malformed fasta file (line = '$buffers{$fh}')\n";
}
delete $buffers{$fh};
while (<$fh>) {
if (/^>/) {
$buffers{$fh} = $_;
last;
}
else {
chomp;
$seq .= $_;
}
}
}
elsif ($fastq_input) {
if ($buffers{$fh} =~ /^@(\S+)/) {
$id = $1;
}
else {
if ($buffers{$fh} =~ /^$/) {
return undef; # Some fastq files end with blank line, allow it
}
die "$PROG: malformed fastq file (line = '$buffers{$fh}')\n";
}
delete $buffers{$fh};
chomp($seq = <$fh>);
scalar <$fh>; # quality header
scalar <$fh>; # quality values
}
else {
# should never get here
die "$PROG: I have no idea what kind of input I'm reading!!!\n";
}
$id =~ s/[\/_.][12]$//; # strip /1 (or .1, _1) or /2 to help comparison
return { id => $id, seq => $seq };
}
}
sub print_merged_sequence {
my ($seq1, $seq2) = @_;
print ">" . $seq1->{id} . "\n";
print $seq1->{seq} . "N" . $seq2->{seq} . "\n";
}