-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathbigwigaverageoverbed.py
245 lines (214 loc) · 9.63 KB
/
bigwigaverageoverbed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import io
import os
import subprocess
from typing import Dict, Optional, Tuple, Union
import numpy as np
import pandas as pd
def get_track_id_to_filename_dict(
    tracks_dir: str,
    tracks_list_filename: str,
    partial: Optional[Tuple[int, int]] = None,
) -> Dict[str, str]:
    """
    Create track ID to bigWig track file mapping.

    :param tracks_dir:
        Directory with bigWig track files.
    :param tracks_list_filename:
        File with track IDs (one per line; blank lines and lines starting with
        "#" are skipped). Entries may be plain track IDs or filenames ending in
        ".bw"/".bigwig" (case-insensitive for the latter).
    :param partial: (current_part, nbr_total_parts)
        Divide the track list in a number of total parts (of similar size) and return only the part defined by
        current_part. This makes it easier to create partial cisTarget motifs or tracks vs regions or genes scores
        database on machines which do not have enough RAM to score all motifs in one iteration, while still being able
        to give the same list of tracks to each instance.
    :return: track_id_to_filename_dict:
        track ID to bigWig filename mapping (sorted by track ID).
    :raises OSError: if no existing bigWig file can be found for a track ID.
    :raises ValueError: if the partial argument values are out of range.
    """
    track_id_to_filename_dict = dict()

    # Create track ID to bigWig track filename mapping.
    with open(tracks_list_filename, "r") as fh:
        for line in fh:
            track_id = line.rstrip()

            if track_id and not track_id.startswith("#"):
                if track_id.endswith(".bw"):
                    track_filename = os.path.join(tracks_dir, track_id)
                    # Remove ".bw" extension from track ID.
                    track_id = track_id[:-3]
                elif track_id.lower().endswith(".bigwig"):
                    track_filename = os.path.join(tracks_dir, track_id)
                    # Remove ".bigwig" extension from track ID.
                    track_id = track_id[:-7]
                else:
                    # If we got a bare track ID, try to find the corresponding
                    # bigWig file by appending common bigWig extensions.
                    track_filenames = [
                        os.path.join(tracks_dir, track_id + bigwig_suffix)
                        for bigwig_suffix in [".bw", ".bigWig", ".bigwig"]
                    ]

                    for track_filename in track_filenames:
                        if os.path.exists(track_filename):
                            break

                if not os.path.exists(track_filename):
                    raise OSError(
                        f'Error: bigWig track ID filename "{track_filename}" does not exist for track {track_id}.'
                    )
                else:
                    track_id_to_filename_dict[track_id] = track_filename

    # Sort track IDs.
    track_ids_sorted = sorted(track_id_to_filename_dict.keys())

    if partial:
        current_part, nbr_total_parts = partial

        if nbr_total_parts < 1:
            raise ValueError(
                f'"nbr_total_parts" ({nbr_total_parts}) of partial argument should be >= 1.'
            )

        if current_part < 1:
            raise ValueError(
                f'"current_part" ({current_part}) of partial argument should be >= 1.'
            )

        if current_part > nbr_total_parts:
            raise ValueError(
                f'"current_part" ({current_part}) of partial argument should be <= "nbr_total_parts" '
                f"({nbr_total_parts})."
            )

        # Get partial track IDs list for current requested part of the track IDs.
        # If this function is run with a different current_part number, each partial track IDs list should have
        # a similar number of tracks.
        partial_track_ids_list = [
            track_ids_sorted[i]
            for i in range(
                current_part - 1, len(track_id_to_filename_dict), nbr_total_parts
            )
        ]

        # Recreate dictionaries with track IDs sorted by track ID.
        track_id_to_filename_dict = {
            track_id: track_id_to_filename_dict[track_id]
            for track_id in partial_track_ids_list
        }
    else:
        # Recreate dictionaries with track IDs sorted by track ID.
        track_id_to_filename_dict = {
            track_id: track_id_to_filename_dict[track_id]
            for track_id in track_ids_sorted
        }

    return track_id_to_filename_dict
def run_bigwig_average_over_bed_for_track(
    bigwig_average_over_bed_path: str,
    bed_filename: str,
    bigwig_filename: str,
    track_id: str,
    extract_gene_id_from_region_id_regex_replace: Optional[str] = None,
    scoring_column: str = "max",
    ssh_command: Optional[Union[str, list]] = None,
) -> Tuple[str, pd.DataFrame]:
    """
    Score each region in the BED file with bigWigAverageOverBed over the bigWig file
    and only keep the top track score per region ID/gene ID.

    :param bigwig_average_over_bed_path: Path to bigWigAverageOverBed binary.
    :param bed_filename: BED filename with regions to score.
    :param bigwig_filename: BigWig filename from which to extract the score for each region in the BED filename.
    :param track_id: Track ID.
    :param extract_gene_id_from_region_id_regex_replace:
        Define a regex which will remove the non-gene part of the region ID of each region
        name in the BED file, so only the gene ID remains. If set to None the whole region ID
        will be kept instead. In case of region IDs, the best track score per region is kept.
        In case of gene IDs, the best track score from multiple regions is kept.
    :param scoring_column: Scoring column to use from bigWigAverageOverBed output (default: "max").
    :param ssh_command: If defined, run bigWigAverageOverBed over ssh by running the provided command to make the
        connection before running bigWigAverageOverBed.
        Example : 'ssh -o ControlMaster=auto -o ControlPath=/tmp/ssh-control-path-%l-%h-%p-%r -o ControlPersist=600 <hostname>'
    :return: (track_id, df_track_scores): track ID and dataframe with top track score per region/gene ID.
    :raises RuntimeError: if bigWigAverageOverBed could not be started or exited with a non-zero exit status.
    """
    bigwig_average_over_bed_command = []

    if ssh_command:
        # Add SSH command to the start of the bigWigAverageOverBed command.
        if isinstance(ssh_command, str):
            bigwig_average_over_bed_command.extend(ssh_command.split())
        elif isinstance(ssh_command, list):
            bigwig_average_over_bed_command.extend(ssh_command)

    # Construct bigWigAverageOverBed command line.
    # "-minMax" adds "min" and "max" coverage columns to the output.
    bigwig_average_over_bed_command.extend(
        [
            bigwig_average_over_bed_path,
            "-minMax",
            bigwig_filename,
            bed_filename,
            "/dev/stdout",
        ]
    )

    # Score each region in the BED file with bigWigAverageOverBed for the provided bigWig track.
    try:
        pid = subprocess.Popen(
            args=bigwig_average_over_bed_command,
            bufsize=-1,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=False,
        )
        stdout_data, stderr_data = pid.communicate()
    except OSError as msg:
        raise RuntimeError(
            "Execution error for: '"
            + " ".join(bigwig_average_over_bed_command)
            + "': "
            + str(msg)
        ) from msg

    if pid.returncode != 0:
        # Include the captured standard error output so failures can be diagnosed.
        raise RuntimeError(
            "Error: Non-zero exit status for: '"
            + " ".join(bigwig_average_over_bed_command)
            + "': "
            + stderr_data.decode("utf-8", errors="replace")
        )

    # Read bigWigAverageOverBed standard out as a pandas dataframe.
    df_track_scores = pd.read_csv(
        filepath_or_buffer=io.BytesIO(stdout_data),
        sep="\t",
        header=None,
        names=[
            "name",  # name field from bed, which should be unique
            "size",  # size of bed (sum of exon sizes)
            "covered",  # bases within exons covered by bigWig
            "sum",  # sum of values over all bases covered
            "mean0",  # average over bases with non-covered bases counting as zeroes
            "mean",  # average over just covered bases
            "min",  # minimum coverage value
            "max",  # maximum coverage value
        ],
        index_col="name",
        usecols=["name", scoring_column],
        dtype={"name": str, scoring_column: np.float32},
        engine="c",
    )

    df_track_scores.rename(columns={scoring_column: "track_score"}, inplace=True)

    if extract_gene_id_from_region_id_regex_replace:
        # Extract gene ID from the region ID by removing the non-gene part.
        #
        # Take the top track score for each gene ID by taking the maximum track
        # score of all region IDs that belong to the same gene ID.
        #
        # Examples:
        #   - extract_gene_id_from_region_id_regex_replace = '#[0-9]+$'
        #       - region ID (input):
        #           "geneA#1", "geneB#1", "geneC#1", "geneA#2", "geneC#2"
        #       - gene IDs (output):
        #           "geneA", "geneB", "geneC"
        #   - extract_gene_id_from_region_id_regex_replace = '^.+@@'
        #       - region IDs (input):
        #           "region1@@geneA", "region2@@geneB", "region3@@geneA"
        #       - gene IDs (output):
        #           "geneA", "geneB"
        df_track_scores = (
            df_track_scores.assign(
                gene_ids=df_track_scores.index.str.replace(
                    extract_gene_id_from_region_id_regex_replace, "", regex=True
                )
            )
            .groupby("gene_ids")
            .max()
        )

    return track_id, df_track_scores