forked from shendurelab/LACHESIS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_bed_around_RE_site.pl
executable file
·199 lines (152 loc) · 7.49 KB
/
make_bed_around_RE_site.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/perl -w
use strict;
#///////////////////////////////////////////////////////////////////////////////
#// //
#// This software and its documentation are copyright (c) 2014-2015 by Joshua //
#// N. Burton and the University of Washington. All rights are reserved. //
#// //
#// THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS //
#// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF //
#// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. //
#// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY //
#// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT //
#// OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR //
#// THE USE OR OTHER DEALINGS IN THE SOFTWARE. //
#// //
#///////////////////////////////////////////////////////////////////////////////
# make_bed_around_restriction_site.pl: Make a BED file representing the regions around all occurrences of a restriction site.
#
# For syntax, run with no arguments.
#
# The output BED file is designed for use with bedtools intersect, as follows:
# bedtools intersect -abam [SRR.bam] -b [$BED_out] > [SRR.REduced.bam]
# samtools view -h [SRR.REduced.bam] > [SRR.REduced.sam]
# This restricts a SAM/BAM file to only include reads close to a restriction site, which is a good way to filter Hi-C data, according to Fig. 1b of this paper:
# http://www.nature.com/ng/journal/v43/n11/full/ng.947.html
# Also see PreprocessSAM.pl, which uses the output file.
#
# Josh Burton
# April 2013
if ( scalar @ARGV != 3 ) {
# Report syntax.
print "\nmake_bed_around_RE_site.pl\n\n";
print "Find all occurrences of a motif in a genome. Make a 'POS' file listing these occurrences, and also a BED file representing the regions around these occurrences.\n\n";
print "SYNTAX:\tmake_bed_around_RE_site.pl <fasta> <motif> <range>\n";
print "fasta:\tA fasta file representing a genome (reference or draft assembly.)\n";
print "motif:\tA motif, typically a restriction site sequence (e.g., HindIII = AAGCTT, NcoI = CCATGG, Dpn1 = GATC).\n";
print "range:\tA number representing how many bp around the sequence to include. Recommend 500 based on Yaffe & Tanay, Nat. Genetics 2011.\n\n";
print "OUTPUT FILES:\n";
print "<fasta>.near_<motif>.<range>.bed\n";
print "<fasta>.near_pos_of_<motif>.txt\n";
print "\n";
exit;
}
# Get command-line arguments.
my ( $FASTA_in, $motif_seq, $range ) = @ARGV;
my $verbose = 0;
# Convert the motif from a string into a regex. Unroll the IUPAC codes from single letters into Perl-parseable regular expressions.
my $motif_regex = $motif_seq;
$motif_regex =~ s/R/\[AG\]/g;
$motif_regex =~ s/Y/\[CT\]/g;
$motif_regex =~ s/S/\[CG\]/g;
$motif_regex =~ s/W/\[AT\]/g;
$motif_regex =~ s/K/\[GT\]/g;
$motif_regex =~ s/M/\[AC\]/g;
$motif_regex =~ s/B/\[CGT\]/g;
$motif_regex =~ s/D/\[AGT\]/g;
$motif_regex =~ s/H/\[ACT\]/g;
$motif_regex =~ s/V/\[ACG\]/g;
$motif_regex =~ s/N/\[ACGT\]/g;
# Derive an output filename.
my $BED_out = "$FASTA_in.near_$motif_seq.$range.bed";
my $POS_out = "$FASTA_in.pos_of_$motif_seq.txt";
# Determine how many letters needed to be added to each line in order to find instances of the sequence that bridge lines in the fasta.
my $N_prev_chars = length($motif_seq) - 1;
my $contig_name = '';
my $offset = 0;
my $prev_chars;
my @motif_positions;
my $N_motifs_found = 0;
# Open the input fasta file and read through it line-by-line.
print localtime() . ": Reading file $FASTA_in...\n";
open IN, '<', $FASTA_in or die "Can't find file `$FASTA_in'";
open BED, '>', $BED_out or die;
open POS, '>', $POS_out or die;
while (<IN>) {
my $line = $_;
chomp $line;
# If this is a header line, we're done with this contig/chromosome (unless we just started), and start a new contig/chromosome.
if ( $line =~ /^\>(\S+)/ ) {
# The hash %motif_positions contains all positions on the (now complete) old contig at which this motif appears.
# Convert this list of positions to a set of BED lines, as necessary.
my ( $prev_start, $prev_end ) = (-1,-1);
foreach my $pos ( @motif_positions ) {
if ( $prev_end == -1 ) {
$prev_start = $pos;
$prev_end = $pos;
}
if ( $prev_end + 2*$range < $pos ) {
$prev_start = $range if $prev_start < $range;
$prev_end = $offset - $range if $prev_end > $offset - $range; # prevent overflow past the end of the contig/chromosome
print BED "$contig_name\t", $prev_start - $range, "\t", $prev_end + $range, "\n";
$prev_start = $pos;
}
#print "pos = $pos\n";
$prev_end = $pos;
}
# Print the final BED line for this contig/chromosome.
if (@motif_positions) {
$prev_start = $range if $prev_start < $range;
$prev_end = $offset - $range if $prev_end > $offset - $range; # prevent overflow past the end of the contig/chromosome
print BED "$contig_name\t", $prev_start - $range, "\t", $prev_end + $range, "\n";
}
# Get the new contig's name.
$contig_name = $1;
print localtime() . ": $contig_name\n" if $verbose;
print POS ">$contig_name\n";
# Reset other contig-related variables.
$offset = 0;
$prev_chars = '';
@motif_positions = ();
}
# Otherwise, read through this contig/chromosome.
else {
if ( $offset != 0 ) { die unless $prev_chars; }
my $verbose = 0;
# Look for instances of this motif in this line of the fasta (including the overlap characters from the previous line, tacked on at the beginning.)
my $motif_loc = -1;
my $target_str = "$prev_chars" . uc $line;
my @matches;
while ($target_str =~ /$motif_regex/g ) {
# Every iteration in this loop represents a new match to the motif regex in the terget string.
my $motif_loc = $-[0];
# Adjust the location so it properly describes the 0-indexed motif position in this contig.
# Then add it to the list of contig positions at which the motif has been seen.
$N_motifs_found++;
my $true_motif_loc = $motif_loc + $offset - length $prev_chars; # adjust index so it properly describes the 0-indexed motif position in this contig
push @motif_positions, $true_motif_loc;
print "$contig_name\t$offset\t$prev_chars\t->\t$motif_loc\n" if $verbose;
print POS "$true_motif_loc\n";
}
# TODO: remove
while (0) {
$motif_loc = index "$prev_chars$line", $motif_seq, $motif_loc + 1;
last if ( $motif_loc == -1 ); # no more instances found
# Found a motif! Add its index to the list of contig positions at which the motif has been seen.
$N_motifs_found++;
my $true_motif_loc = $motif_loc + $offset - length $prev_chars; # adjust index so it properly describes the 0-indexed motif position in this contig
push @motif_positions, $true_motif_loc;
print "$contig_name\t$offset\t$prev_chars\t->\t$motif_loc\n" if $verbose;
print POS "$true_motif_loc\n";
}
# Save the last few characters of this line, so that they can be appended onto the next line in a search for the sequence.
my $line_len = length $line;
$prev_chars = substr( $line, $line_len - $N_prev_chars );
$offset += $line_len;
}
}
close IN;
close BED;
close POS;
print localtime() . ": Done! Found $N_motifs_found total instances of motif $motif_seq. Created files:\n";
print "$BED_out\n$POS_out\n";