-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitFasta.pl
executable file
·92 lines (76 loc) · 1.53 KB
/
splitFasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env perl
=head1 Author

Dinghua Li <[email protected]>

=head1 Usage

    splitFasta.pl [options] <in.fa>

=head1 Options

    -c <float>  Chunk size [3.9e9]
    -p <str>    Output prefix [in.fa]

=cut
use warnings;
use strict;
use Getopt::Long;
# Size limit per output chunk (-c), compared against $acc_size in write_seq().
my $chunk_size = 3.9e9;
# Output file prefix (-p); derived from the input file name when not given.
my $prefix = undef;

# Stop with the usage text on unknown or malformed options instead of
# silently ignoring them.
GetOptions(
    "c=f" => \$chunk_size,
    "p=s" => \$prefix
) or die `pod2text $0`;

unless (@ARGV >= 1) {
    die `pod2text $0`;
}

my $fa_fn = $ARGV[0];

# Default prefix: when the input name matches /^(\S+)\.gz/, use the captured
# part in front of ".gz"; otherwise use the input name unchanged.
unless (defined($prefix)) {
    if ($fa_fn =~ /^(\S+)\.gz/) {
        $prefix = $1;
    } else {
        $prefix = $fa_fn;
    }
}

# State shared with split_fa() and write_seq().
my $file_idx = 0;    # index of the chunk currently being written
my $out_file;        # handle of the chunk currently being written
my $acc_size = 0;    # running size compared against $chunk_size
my $seq_size = 0;    # per-record size counter, reset at each header line

# Chunks are named "<prefix>.<index>", starting at index 0.
open($out_file, ">", $prefix.".".$file_idx)
    or die "cannot open $prefix.$file_idx: $!";
split_fa();
# Buffered write errors (e.g. a full disk) only surface at close, so check it.
close($out_file) or die "cannot close $prefix.$file_idx: $!";
# Read the FASTA input named by $fa_fn and pass each record to write_seq().
#
# The input is STDIN when $fa_fn is "-", the output of "gzip -cd" when the
# name matches /^(\S+)\.gz/, and the plain file otherwise.  A record is a
# line starting with ">" plus every following line up to the next such line.
# Lines that appear before the first header are dropped.
#
# Dies if the input cannot be opened.
sub split_fa {
    my $in;
    if ($fa_fn eq "-") {
        $in = *STDIN;
    } elsif ($fa_fn =~ /^(\S+)\.gz/) {
        # List-form pipe open: the file name reaches gzip as one argument and
        # is never parsed by a shell, so spaces and metacharacters are safe.
        open($in, "-|", "gzip", "-cd", $fa_fn) or die "cannot open $fa_fn";
    } else {
        open($in, "<", "$fa_fn") or die "cannot open $fa_fn";
    }

    my $name = undef;
    my @seqs = ();    # start empty; "= undef" would create a one-element list
    while (my $line = <$in>) {
        chomp $line;
        if ($line =~ /^>/) {
            # A new header ends the previous record: flush it, then reset.
            write_seq($name, @seqs) if defined($name);
            $name = $line;
            @seqs = ();
            $seq_size = 0;
        } else {
            # Accumulate the size of the current record only.  write_seq()
            # adds $seq_size to $acc_size and, on rollover, restarts $acc_size
            # from $seq_size, so the record must not be counted here as well.
            $seq_size += length($line);
            push(@seqs, $line);
        }
    }
    # Flush the last record, which has no following header to trigger it.
    write_seq($name, @seqs) if defined($name);
    close($in) unless $fa_fn eq "-";
}
# Append one FASTA record to the current output chunk, switching to a new
# chunk file first when the running size exceeds $chunk_size.
#
#   $name  header line of the record; printed as given, followed by a newline
#   @seqs  sequence lines of the record; each is printed on its own line
#
# Reads the file-level $seq_size, $chunk_size and $prefix, and updates
# $acc_size, $file_idx and $out_file.
# Dies if a chunk file cannot be closed, opened or written.
sub write_seq {
    my ($name, @seqs) = @_;

    $acc_size += $seq_size;
    if ($acc_size > $chunk_size) {
        # The record that crossed the limit goes into the next chunk, so the
        # running size restarts from this record's own size.
        $acc_size = $seq_size;
        # Buffered write errors only surface at close, so check it.
        close($out_file) or die "cannot close $prefix.$file_idx: $!";
        $file_idx++;
        open($out_file, ">", $prefix.".".$file_idx)
            or die "cannot open $prefix.$file_idx: $!";
    }

    print {$out_file} $name."\n"
        or die "cannot write to $prefix.$file_idx: $!";
    foreach my $seq (@seqs) {
        print {$out_file} $seq."\n"
            or die "cannot write to $prefix.$file_idx: $!";
    }
}